tokenizers/tokenizer/
normalizer.rs

1use crate::pattern::Pattern;
2use crate::{Offsets, Result};
3use std::ops::{Bound, RangeBounds};
4use unicode_normalization_alignments::UnicodeNormalization;
5
6use serde::{Deserialize, Serialize};
7
8/// The possible offsets referential
9#[derive(Debug, Clone, Copy, PartialEq, Eq)]
10pub enum OffsetReferential {
11    Original,
12    Normalized,
13}
14
15/// Represents a Range usable by the NormalizedString to index its content.
16/// A Range can use indices relative to either the `Original` or the `Normalized` string
17#[derive(Debug, Clone, PartialEq, Eq)]
18pub enum Range<T: RangeBounds<usize> + Clone> {
19    Original(T),
20    Normalized(T),
21}
22
23#[allow(clippy::len_without_is_empty)]
24impl<T> Range<T>
25where
26    T: RangeBounds<usize> + Clone,
27{
28    /// Unwrap the underlying range
29    pub fn unwrap(self) -> T {
30        match self {
31            Self::Original(r) => r,
32            Self::Normalized(r) => r,
33        }
34    }
35
36    /// Return the length of the current Range if not Unbounded
37    pub fn len(&self) -> Option<usize> {
38        let range = self.clone().unwrap();
39
40        let end = match range.end_bound() {
41            Bound::Unbounded => None,
42            Bound::Included(i) => Some(*i + 1),
43            Bound::Excluded(i) => Some(*i),
44        }?;
45
46        match range.start_bound() {
47            Bound::Unbounded => Some(end),
48            Bound::Included(i) => Some(end - *i),
49            Bound::Excluded(i) => Some(end - (*i + 1)),
50        }
51    }
52
53    /// Converts the current Range to a `std::ops::Range<usize>`. This requires the `max_len`
54    /// of the represented string (in chars, not bytes) in order to cover the case where the
55    /// original provided range was unbounded
56    pub fn into_full_range(self, max_len: usize) -> std::ops::Range<usize> {
57        let range = self.unwrap();
58
59        let start = match range.start_bound() {
60            Bound::Unbounded => 0,
61            Bound::Included(i) => *i,
62            Bound::Excluded(i) => *i + 1,
63        };
64        let end = match range.end_bound() {
65            Bound::Unbounded => max_len,
66            Bound::Included(i) => *i + 1,
67            Bound::Excluded(i) => *i,
68        };
69
70        start..end
71    }
72}
73
74/// Defines the expected behavior for the delimiter of a Split Pattern
75/// When splitting on `'-'` for example, with input `the-final--countdown`:
76///  - Removed => `[ "the", "final", "countdown" ]`
77///  - Isolated => `[ "the", "-", "final", "-", "-", "countdown" ]`
78///  - MergedWithPrevious => `[ "the-", "final-", "-", "countdown" ]`
79///  - MergedWithNext => `[ "the", "-final", "-", "-countdown" ]`
80///  - Contiguous => `[ "the", "-", "final", "--", "countdown" ]`
81#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
82pub enum SplitDelimiterBehavior {
83    Removed,
84    Isolated,
85    MergedWithPrevious,
86    MergedWithNext,
87    Contiguous,
88}
89
90/// A `NormalizedString` takes care of processing an "original" string to modify
91/// it and obtain a "normalized" string. It keeps both version of the string,
92/// alignments information between both and provides an interface to retrieve
93/// ranges of each string, using offsets from any of them.
94///
95/// It is possible to retrieve a part of the original string, by indexing it with
96/// offsets from the normalized one, and the other way around too. It is also
97/// possible to convert offsets from one referential to the other one easily.
98#[derive(Default, Debug, Clone, PartialEq, Eq)]
99pub struct NormalizedString {
100    /// The original version of the string, before any modification
101    original: String,
102    /// The normalized version of the string, after all modifications
103    normalized: String,
104    /// Mapping from normalized string to original one: (start, end) for each
105    /// byte of the normalized string
106    alignments: Vec<(usize, usize)>,
107    /// If this NormalizedString is a slice of a bigger one, we keep the track
108    /// of the missing part, so that we can still give offsets from this original
109    /// string.
110    original_shift: usize,
111}
112
113impl NormalizedString {
114    #[cfg(test)]
115    pub(crate) fn new(
116        original: String,
117        normalized: String,
118        alignments: Vec<(usize, usize)>,
119        original_shift: usize,
120    ) -> Self {
121        Self {
122            original,
123            normalized,
124            alignments,
125            original_shift,
126        }
127    }
128    /// Return the normalized string
129    pub fn get(&self) -> &str {
130        &self.normalized
131    }
132
133    /// Return the original string
134    pub fn get_original(&self) -> &str {
135        &self.original
136    }
137
138    /// Return the original offsets
139    pub fn offsets_original(&self) -> Offsets {
140        (
141            self.original_shift,
142            self.original_shift + self.len_original(),
143        )
144    }
145
146    /// Convert the given offsets range from one referential to the other one:
147    /// `Original => Normalized` or `Normalized => Original`
148    ///
149    /// Returns `None` when targeting something that is outside range
150    pub fn convert_offsets<T>(&self, range: Range<T>) -> Option<std::ops::Range<usize>>
151    where
152        T: RangeBounds<usize> + Clone,
153    {
154        let len_original = self.len_original();
155        let len_normalized = self.len();
156
157        let (target, original) = match range {
158            Range::Original(_) => (range.into_full_range(len_original), true),
159            Range::Normalized(_) => (range.into_full_range(len_normalized), false),
160        };
161
162        // If we target an empty range, let's return the same
163        if target.start == target.end {
164            return Some(target);
165        }
166        // If the target goes reverse, return None
167        if target.start > target.end {
168            return None;
169        }
170
171        // If we target 0..0 on an empty string, we want to expand to the entire equivalent
172        if original && self.original.is_empty() && target == (0..0) {
173            return Some(0..len_normalized);
174        }
175        if !original && self.normalized.is_empty() && target == (0..0) {
176            return Some(0..len_original);
177        }
178
179        if original {
180            let (mut start, mut end) = (None, None);
181            self.alignments
182                .iter()
183                .enumerate()
184                .take_while(|(_, alignment)| target.end >= alignment.1)
185                .for_each(|(i, alignment)| {
186                    if start.is_none() && target.start <= alignment.0 {
187                        // For now, don't update if width == 0
188                        if alignment.0 != alignment.1 {
189                            start = Some(i);
190                        }
191                    }
192                    if target.end >= alignment.1 {
193                        end = Some(i + 1);
194                    }
195                });
196
197            match (start, end) {
198                // Targetting inexistant beginning
199                (Some(s), None) => Some(s..s),
200                // Targetting inexistant end
201                (None, Some(e)) => Some(e..e),
202                // Found the range
203                (Some(s), Some(e)) => Some(s..e),
204                _ => None,
205            }
206        } else {
207            self.alignments.get(target).and_then(expand_alignments)
208        }
209    }
210
211    /// Return a range of the normalized string
212    pub fn get_range<T>(&self, range: Range<T>) -> Option<&str>
213    where
214        T: RangeBounds<usize> + Clone,
215    {
216        match range {
217            Range::Original(_) => self.normalized.get(self.convert_offsets(range)?),
218            Range::Normalized(_) => self.normalized.get(range.into_full_range(self.len())),
219        }
220    }
221
222    /// Return a range of the original string
223    pub fn get_range_original<T>(&self, range: Range<T>) -> Option<&str>
224    where
225        T: RangeBounds<usize> + Clone,
226    {
227        match range {
228            Range::Original(_) => self
229                .original
230                .get(range.into_full_range(self.len_original())),
231            Range::Normalized(_) => self.original.get(self.convert_offsets(range)?),
232        }
233    }
234
235    /// Validate the given range, to make sure it is on char boundaries
236    fn validate_range<T: RangeBounds<usize> + Clone>(
237        &self,
238        range: Range<T>,
239    ) -> Option<Range<std::ops::Range<usize>>> {
240        match range {
241            Range::Original(_) => {
242                let r = range.into_full_range(self.original.len());
243                if !(self.original.is_char_boundary(r.start)
244                    && self.original.is_char_boundary(r.end))
245                {
246                    None
247                } else {
248                    Some(Range::Original(r))
249                }
250            }
251            Range::Normalized(_) => {
252                let r = range.into_full_range(self.normalized.len());
253                if !(self.normalized.is_char_boundary(r.start)
254                    && self.normalized.is_char_boundary(r.end))
255                {
256                    None
257                } else {
258                    Some(Range::Normalized(r))
259                }
260            }
261        }
262    }
263
264    /// Return a slice of the current NormalizedString
265    /// If the range is not on char boundaries, return None
266    pub fn slice<T>(&self, range: Range<T>) -> Option<NormalizedString>
267    where
268        T: RangeBounds<usize> + Clone,
269    {
270        let full_range = self.validate_range(range)?;
271        let (normalized_range, original_range) = match full_range {
272            Range::Original(_) => (
273                self.convert_offsets(full_range.clone())?,
274                full_range.clone().unwrap(),
275            ),
276            Range::Normalized(_) => (
277                full_range.clone().unwrap(),
278                self.convert_offsets(full_range.clone())?,
279            ),
280        };
281
282        let n_shift = original_range.start;
283
284        Some(Self {
285            original: self
286                .get_range_original(full_range.clone())
287                .unwrap_or_default()
288                .into(),
289            normalized: self.get_range(full_range).unwrap_or_default().into(),
290            alignments: self
291                .alignments
292                .get(normalized_range)?
293                .to_vec()
294                .iter()
295                .map(|(start, end)| (start - n_shift, end - n_shift))
296                .collect(),
297            original_shift: self.original_shift + original_range.start,
298        })
299    }
300
301    /// Applies transformations to the current normalized version of the string,
302    /// while updating the alignments.
303    /// This method expect an Iterator yielding each char of the new normalized string
304    /// with a `change` isize equals to:
305    ///   - `1` if this is a new char
306    ///   - `-N` if the char is right before N removed chars
307    ///   - `0` if the char is replacing the existing one
308    ///
309    /// Since it is possible that the normalized string doesn't include some of the characters at
310    /// the beginning of the original one, we need an `initial_offset` which represents the number
311    /// of removed chars at the very beginning.
312    pub fn transform_range<T, I>(&mut self, range: Range<T>, dest: I, initial_offset: usize)
313    where
314        T: RangeBounds<usize> + Clone,
315        I: IntoIterator<Item = (char, isize)>,
316    {
317        let n_range = match range {
318            Range::Normalized(_) => range.into_full_range(self.len()),
319            Range::Original(_) => match self.convert_offsets(range) {
320                Some(range) => range,
321                None => return,
322            },
323        };
324        trace!(
325            "===== transform_range call with {:?} (initial_offset: {}) =====",
326            n_range,
327            initial_offset
328        );
329
330        // Retrieve the original characters that are being replaced. This let us
331        // compute the change in byte sizes along the way.
332        let mut replaced_normalized = self.normalized[n_range.clone()]
333            .chars()
334            .collect::<Vec<_>>()
335            .into_iter();
336        let initial_removed: usize = (&mut replaced_normalized)
337            .take(initial_offset)
338            .map(|c| c.len_utf8())
339            .sum();
340
341        let mut offset = (initial_removed + n_range.start) as isize;
342        let mut alignments = Vec::with_capacity(n_range.len());
343        trace!("=> Applying transformations");
344        let normalized = dest
345            .into_iter()
346            .map(|(c, changes)| {
347                trace!(
348                    "### {:?} with size {}: {} with offset {} ###",
349                    c,
350                    c.len_utf8(),
351                    match changes {
352                        0 => "Replacing".into(),
353                        ch if ch > 0 => "Adding".into(),
354                        ch if ch < 0 => format!("Replacing + removing {ch} following chars"),
355                        _ => "Undefined".into(),
356                    },
357                    offset
358                );
359
360                let idx = offset as usize;
361                let align = if changes.is_positive() {
362                    if idx < 1 {
363                        (0, 0)
364                    } else {
365                        // This is a newly inserted character, so it shares the same alignment
366                        // than the previous one
367                        self.alignments[idx - 1]
368                    }
369                } else {
370                    self.alignments[idx]
371                };
372
373                // If we are replacing a character, find it and compute the change in size
374                let replaced_char = if !changes.is_positive() {
375                    replaced_normalized.next()
376                } else {
377                    None
378                };
379                let replaced_char_size = replaced_char.map_or(0, |c| c.len_utf8());
380                let replaced_char_size_change = c.len_utf8() as isize - replaced_char_size as isize;
381                if let Some(ref replaced_char) = replaced_char {
382                    trace!(
383                        "Replacing char {:?} - with a change in size: {}",
384                        replaced_char,
385                        replaced_char_size_change
386                    );
387                }
388
389                // If we are removing some characters, find them too
390                let total_bytes_to_remove = if changes.is_negative() {
391                    (&mut replaced_normalized)
392                        .take(-changes as usize)
393                        .map(|c| c.len_utf8())
394                        .sum()
395                } else {
396                    0
397                };
398                trace!("Total bytes to remove: {}", total_bytes_to_remove);
399
400                // Keep track of the changes for next offsets
401                offset += replaced_char_size as isize;
402                offset += total_bytes_to_remove as isize;
403                trace!("New offset: {}", offset);
404
405                trace!("New normalized alignment: {}x {:?}", c.len_utf8(), align);
406                alignments.extend((0..c.len_utf8()).map(|_| align));
407
408                // Then we keep only the char for string reconstruction
409                c
410            })
411            .collect::<String>();
412
413        self.alignments.splice(n_range.clone(), alignments);
414
415        // This bounds check already happens above (`self.normalized[n_range.clone()]`), but future
416        // code could change to mutate `self` or `self.normalized` in the interim.
417        // Perform it again and hope the optimizer collapses it.
418        assert!(self.normalized.get(n_range.clone()).is_some());
419        unsafe {
420            self.normalized
421                // Safety: This is safe as long as we do not splice across a
422                // UTF-8 character, and we only add UTF-8 text. `normalized` is a String
423                // so the latter is trivially true, and we assert for the former above.
424                .as_mut_vec()
425                .splice(n_range, normalized.bytes());
426        }
427    }
428
429    /// Applies transformations to the current normalized version of the string,
430    /// while updating the alignments.
431    /// This method expect an Iterator yielding each char of the new normalized string
432    /// with a `change` isize equals to:
433    ///   - `1` if this is a new char
434    ///   - `-N` if the char is right before N removed chars
435    ///   - `0` if the char is replacing the existing one
436    ///
437    /// Since it is possible that the normalized string doesn't include some of the characters at
438    /// the beginning of the original one, we need an `initial_offset` which represents the number
439    /// of removed chars at the very beginning.
440    pub fn transform<I>(&mut self, dest: I, initial_offset: usize)
441    where
442        I: IntoIterator<Item = (char, isize)>,
443    {
444        self.transform_range(Range::Original(..), dest, initial_offset)
445    }
446
447    /// Applies NFD normalization
448    pub fn nfd(&mut self) -> &mut Self {
449        self.transform(self.get().to_owned().nfd(), 0);
450        self
451    }
452
453    /// Applies NFKD normalization
454    pub fn nfkd(&mut self) -> &mut Self {
455        self.transform(self.get().to_owned().nfkd(), 0);
456        self
457    }
458
459    /// Applies NFC normalization
460    pub fn nfc(&mut self) -> &mut Self {
461        self.transform(self.get().to_owned().nfc(), 0);
462        self
463    }
464
465    /// Applies NFKC normalization
466    pub fn nfkc(&mut self) -> &mut Self {
467        self.transform(self.get().to_owned().nfkc(), 0);
468        self
469    }
470
471    /// Applies filtering over our characters
472    pub fn filter<F: Fn(char) -> bool>(&mut self, keep: F) -> &mut Self {
473        let mut removed: isize = 0;
474        let mut removed_start: usize = 0;
475
476        let mut transforms = Vec::with_capacity(self.normalized.len());
477        let mut last_c = None;
478        for c in self.normalized.chars() {
479            if keep(c) {
480                match last_c {
481                    Some(lc) => {
482                        transforms.push((lc, -removed));
483                    }
484                    None => {
485                        removed_start = removed as usize;
486                    }
487                }
488                last_c = Some(c);
489                removed = 0;
490            } else {
491                removed += 1;
492            }
493        }
494        if let Some(lc) = last_c {
495            transforms.push((lc, -removed));
496        }
497        self.transform(transforms, removed_start);
498        self
499    }
500
501    /// Prepend the given string to ourself
502    pub fn prepend(&mut self, s: &str) -> &mut Self {
503        if let Some(next) = self.normalized.chars().next() {
504            let transformations = s
505                .chars()
506                .enumerate()
507                .map(|(i, c)| (c, isize::from(i != 0)))
508                .chain(std::iter::once((next, 1)));
509
510            self.transform_range(Range::Normalized(0..next.len_utf8()), transformations, 0);
511        }
512        self
513    }
514
515    /// Append the given string to ourself
516    pub fn append(&mut self, s: &str) -> &mut Self {
517        if let Some((b, prev)) = self.normalized.char_indices().last() {
518            let transformations = std::iter::once((prev, 0)).chain(s.chars().map(|c| (c, 1)));
519            self.transform_range(Range::Normalized(b..), transformations, 0);
520        }
521        self
522    }
523
524    /// Map our characters
525    pub fn map<F: Fn(char) -> char>(&mut self, map: F) -> &mut Self {
526        let transformations = self
527            .normalized
528            .chars()
529            .map(|c| (map(c), 0))
530            .collect::<Vec<_>>();
531        self.transform(transformations, 0);
532        self
533    }
534
535    /// Calls the given function for each characters
536    pub fn for_each<F: FnMut(char)>(&self, foreach: F) -> &Self {
537        self.normalized.chars().for_each(foreach);
538        self
539    }
540
541    /// Lowercase
542    pub fn lowercase(&mut self) -> &mut Self {
543        let mut new_chars: Vec<(char, isize)> = vec![];
544        self.for_each(|c| {
545            c.to_lowercase().enumerate().for_each(|(index, c)| {
546                new_chars.push((c, isize::from(index > 0)));
547            })
548        });
549        self.transform(new_chars, 0);
550        self
551    }
552
553    /// Uppercase
554    pub fn uppercase(&mut self) -> &mut Self {
555        let mut new_chars: Vec<(char, isize)> = vec![];
556        self.for_each(|c| {
557            c.to_uppercase().enumerate().for_each(|(index, c)| {
558                new_chars.push((c, isize::from(index > 0)));
559            })
560        });
561        self.transform(new_chars, 0);
562        self
563    }
564
565    /// Replace anything that matches the pattern with the given content.
566    pub fn replace<P: Pattern>(&mut self, pattern: P, content: &str) -> Result<()> {
567        let mut new_normalized = String::with_capacity(self.normalized.len()); // Initially allocate for the input size
568        let mut new_alignments: Vec<(usize, usize)> = Vec::with_capacity(self.alignments.len());
569        let mut last_end = 0; // Keep track of the last end position
570
571        pattern
572            .find_matches(&self.normalized)?
573            .into_iter()
574            .for_each(|((start, end), is_match)| {
575                if is_match {
576                    let range = start..end;
577
578                    let mut new_len = 0;
579                    let removed_chars = self.normalized[range.clone()].chars().count();
580
581                    /* The following code is equivalent to this call, but computationally much more efficient
582                    self.transform_range(
583                        Range::Normalized(range),
584                        content.chars().map(|c| {
585                            new_len += c.len_utf8();
586                            (c, 1)
587                        }),
588                        removed_chars,
589                    ); */
590
591                    // Copy the part of the string that is before the match
592                    new_normalized.push_str(&self.normalized[last_end..start]);
593                    new_alignments.extend(self.alignments[last_end..start].iter().cloned());
594
595                    let n_range = Range::Normalized(range).into_full_range(self.len());
596
597                    // Retrieve the original characters that are being replaced. This let us
598                    // compute the change in byte sizes along the way.
599                    let mut replaced_normalized = self.normalized[n_range.clone()]
600                        .chars()
601                        .collect::<Vec<_>>()
602                        .into_iter();
603                    let initial_removed: usize = (&mut replaced_normalized)
604                        .take(removed_chars)
605                        .map(|c| c.len_utf8())
606                        .sum();
607
608                    let dest = content.chars().map(|c| {
609                        new_len += c.len_utf8();
610                        (c, 1)
611                    });
612                    let mut offset = (initial_removed + n_range.start) as isize;
613                    let normalized = dest
614                        .into_iter()
615                        .map(|(c, changes): (char, i32)| {
616                            let idx = offset as usize;
617                            let align = if changes.is_positive() {
618                                if idx < 1 {
619                                    (0, 0)
620                                } else {
621                                    // This is a newly inserted character, so it shares the same alignment
622                                    // than the previous one
623                                    self.alignments[idx - 1]
624                                }
625                            } else {
626                                self.alignments[idx]
627                            };
628
629                            // If we are replacing a character, find it and compute the change in size
630                            let replaced_char = if !changes.is_positive() {
631                                replaced_normalized.next()
632                            } else {
633                                None
634                            };
635                            let replaced_char_size = replaced_char.map_or(0, |c| c.len_utf8());
636
637                            // If we are removing some characters, find them too
638                            let total_bytes_to_remove = if changes.is_negative() {
639                                (&mut replaced_normalized)
640                                    .take(-changes as usize)
641                                    .map(|c| c.len_utf8())
642                                    .sum()
643                            } else {
644                                0
645                            };
646
647                            // Keep track of the changes for next offsets
648                            offset += replaced_char_size as isize;
649                            offset += total_bytes_to_remove as isize;
650
651                            new_alignments.extend((0..c.len_utf8()).map(|_| align));
652
653                            // Then we keep only the char for string reconstruction
654                            c
655                        })
656                        .collect::<String>();
657
658                    new_normalized.push_str(&normalized);
659                    last_end = end;
660                }
661            });
662
663        // Copy the remaining part of the input
664        new_normalized.push_str(&self.normalized[last_end..]);
665        new_alignments.extend(&self.alignments[last_end..]);
666
667        self.normalized = new_normalized;
668        self.alignments = new_alignments;
669        Ok(())
670    }
671
672    /// Clear the normalized part of the string
673    pub fn clear(&mut self) -> usize {
674        let len = self.len();
675        self.transform(std::iter::empty(), len);
676        len
677    }
678
679    /// Split the current string in many subparts. Specify what to do with the
680    /// delimiter.
681    ///
682    /// ## Splitting Behavior for the delimiter
683    ///
684    /// The behavior can be one of the followings:
685    /// When splitting on `'-'` for example, with input `the-final--countdown`:
686    ///  - Removed => `[ "the", "", "final", "", "", "countdown" ]`
687    ///  - Isolated => `[ "the", "-", "final", "-", "-", "countdown" ]`
688    ///  - MergedWithPrevious => `[ "the-", "final-", "-", "countdown" ]`
689    ///  - MergedWithNext => `[ "the", "-final", "-", "-countdown" ]`
690    pub fn split<P: Pattern>(
691        &self,
692        pattern: P,
693        behavior: SplitDelimiterBehavior,
694    ) -> Result<Vec<NormalizedString>> {
695        let matches = pattern.find_matches(&self.normalized)?;
696
697        // Process the matches according to the selected behavior: Vec<(Offsets, should_remove)>
698        use SplitDelimiterBehavior::*;
699        let splits = match behavior {
700            Isolated => matches
701                .into_iter()
702                .map(|(offsets, _)| (offsets, false))
703                .collect(),
704            Removed => matches,
705            Contiguous => {
706                let mut previous_match = false;
707                matches
708                    .into_iter()
709                    .fold(vec![], |mut acc, (offsets, is_match)| {
710                        if is_match == previous_match {
711                            if let Some(((_, end), _)) = acc.last_mut() {
712                                *end = offsets.1;
713                            } else {
714                                acc.push((offsets, false));
715                            }
716                        } else {
717                            acc.push((offsets, false));
718                        }
719                        previous_match = is_match;
720                        acc
721                    })
722            }
723            MergedWithPrevious => {
724                let mut previous_match = false;
725                matches
726                    .into_iter()
727                    .fold(vec![], |mut acc, (offsets, is_match)| {
728                        if is_match && !previous_match {
729                            if let Some(((_, end), _)) = acc.last_mut() {
730                                *end = offsets.1;
731                            } else {
732                                acc.push((offsets, false));
733                            }
734                        } else {
735                            acc.push((offsets, false));
736                        }
737                        previous_match = is_match;
738                        acc
739                    })
740            }
741            MergedWithNext => {
742                let mut previous_match = false;
743                let mut matches =
744                    matches
745                        .into_iter()
746                        .rev()
747                        .fold(vec![], |mut acc, (offsets, is_match)| {
748                            if is_match && !previous_match {
749                                if let Some(((start, _), _)) = acc.last_mut() {
750                                    *start = offsets.0;
751                                } else {
752                                    acc.push((offsets, false));
753                                }
754                            } else {
755                                acc.push((offsets, false));
756                            }
757                            previous_match = is_match;
758                            acc
759                        });
760                matches.reverse();
761                matches
762            }
763        };
764
765        // Then we split according to the computed splits
766        Ok(splits
767            .into_iter()
768            .filter_map(|(offsets, remove)| {
769                if !remove {
770                    Some(
771                        self.slice(Range::Normalized(offsets.0..offsets.1))
772                            .expect("NormalizedString bad split"),
773                    )
774                } else {
775                    None
776                }
777            })
778            .collect())
779    }
780
781    /// Remove any leading space(s) of the normalized string
782    pub fn lstrip(&mut self) -> &mut Self {
783        self.lrstrip(true, false)
784    }
785
786    /// Remove any trailing space(s) of the normalized string
787    pub fn rstrip(&mut self) -> &mut Self {
788        self.lrstrip(false, true)
789    }
790
791    /// Remove any leading and trailing space(s) of the normalized string
792    pub fn strip(&mut self) -> &mut Self {
793        self.lrstrip(true, true)
794    }
795
796    fn lrstrip(&mut self, left: bool, right: bool) -> &mut Self {
797        let leading_spaces = if left {
798            self.get().chars().take_while(|c| c.is_whitespace()).count()
799        } else {
800            0
801        };
802        let trailing_spaces = if right {
803            self.get()
804                .chars()
805                .rev()
806                .take_while(|c| c.is_whitespace())
807                .count()
808        } else {
809            0
810        };
811
812        if leading_spaces > 0 || trailing_spaces > 0 {
813            let count = self.get().chars().count();
814            let transformation = self
815                .normalized
816                .chars()
817                .enumerate()
818                .filter_map(|(i, c)| {
819                    if i < leading_spaces || i >= count - trailing_spaces {
820                        None
821                    } else if i == self.len() - trailing_spaces - 1 {
822                        Some((c, -(trailing_spaces as isize)))
823                    } else {
824                        Some((c, 0))
825                    }
826                })
827                .collect::<Vec<_>>();
828            self.transform(transformation, leading_spaces);
829        }
830        self
831    }
832
833    /// Returns the length of the normalized string (counting chars not bytes)
834    pub fn len(&self) -> usize {
835        self.normalized.len()
836    }
837
838    /// Returns the length of the original string (counting chars not bytes)
839    pub fn len_original(&self) -> usize {
840        self.original.len()
841    }
842
843    /// Whether empty
844    pub fn is_empty(&self) -> bool {
845        self.normalized.is_empty()
846    }
847
848    /// Recalculate original alignments
849    #[allow(dead_code)]
850    pub(crate) fn alignments_original(&self) -> Vec<(usize, usize)> {
851        // Start, end are in alignments
852        // offset, length are in alignments_original
853        let mut alignments_original = Vec::with_capacity(self.original.len());
854
855        // Eventual gap before first group
856        let start = self.alignments[0].0;
857        if start != 0 {
858            alignments_original.extend(vec![(0, 0); start]);
859        }
860
861        let mut last = (&self.alignments[0].0, &self.alignments[0].1);
862        let mut offset = 0;
863        let mut length = 0;
864        for (start, end) in &self.alignments {
865            if last == (start, end) {
866                // This is the same group
867                length += 1;
868            } else {
869                // This is a new group
870                if start < last.1 {
871                    panic!("We can't have overlapping ranges.");
872                }
873
874                // Add the old group
875                alignments_original.extend(vec![(offset, offset + length); last.1 - last.0]);
876                offset += length;
877                length = 1;
878
879                // Eventual gap between the 2 groups
880                alignments_original.extend(vec![(offset, offset); start - last.1]);
881            }
882
883            last = (start, end);
884        }
885        // Add the last group
886        alignments_original.extend(vec![(offset, offset + length); last.1 - last.0]);
887
888        // Add eventual last gap
889        offset += length;
890        alignments_original.extend(vec![
891            (offset, offset);
892            self.original.len() - alignments_original.len()
893        ]);
894
895        // assert_eq!(alignments_original.len(), self.original.len());
896        alignments_original
897    }
898}
899
900/// Returns the range covered by a slice of alignments
901fn expand_alignments(alignments: &[(usize, usize)]) -> Option<std::ops::Range<usize>> {
902    if alignments.is_empty() {
903        None
904    } else {
905        let start = alignments[0].0;
906        let end = alignments[alignments.len() - 1].1;
907        Some(start..end)
908    }
909}
910
911/// Returns a range of the given string slice, by indexing chars instead of bytes
912pub fn get_range_of<T: RangeBounds<usize>>(s: &str, range: T) -> Option<&str> {
913    let len = s.chars().count();
914    let start = match range.start_bound() {
915        Bound::Unbounded => 0,
916        Bound::Included(i) => *i,
917        Bound::Excluded(i) => *i + 1,
918    };
919    let end = match range.end_bound() {
920        Bound::Unbounded => len,
921        Bound::Included(i) => *i + 1,
922        Bound::Excluded(i) => *i,
923    };
924
925    if start == 0 && end == 0 {
926        Some(&s[0..0])
927    } else if start >= len || end > len || start >= end {
928        None
929    } else {
930        let start_b = s.char_indices().map(|(i, _)| i).nth(start).unwrap_or(0);
931        let end_b = s.char_indices().map(|(i, _)| i).nth(end).unwrap_or(s.len());
932        Some(&s[start_b..end_b])
933    }
934}
935
936/// Convert the given range from bytes to char
937pub fn bytes_to_char(s: &str, range: std::ops::Range<usize>) -> Option<std::ops::Range<usize>> {
938    let (mut start, mut end) = if range == (0..0) {
939        (Some(0), Some(0))
940    } else {
941        (None, None)
942    };
943
944    s.char_indices()
945        .enumerate()
946        .take_while(|(_, (b, _))| *b <= range.end)
947        .filter(|(_, (b, _))| *b >= range.start)
948        .for_each(|(i, (b, c))| {
949            if b == range.start {
950                start = Some(i);
951            }
952            if b == range.end {
953                end = Some(i);
954            }
955            if b + c.len_utf8() == range.end {
956                end = Some(i + 1);
957            }
958        });
959
960    Some(start?..end?)
961}
962
963/// Convert the given range from char to bytes
964pub fn char_to_bytes(s: &str, range: std::ops::Range<usize>) -> Option<std::ops::Range<usize>> {
965    let (mut start, mut end) = if range == (0..0) {
966        (Some(0), Some(0))
967    } else {
968        (None, None)
969    };
970
971    if range.start == range.end {
972        s.char_indices()
973            .skip(range.start)
974            .take(1)
975            .for_each(|(b, _)| {
976                start = Some(b);
977                end = Some(b);
978            });
979    } else {
980        s.char_indices()
981            .skip(range.start)
982            .take(range.end - range.start)
983            .for_each(|(b, c)| {
984                if start.is_none() {
985                    start = Some(b);
986                }
987                end = Some(b + c.len_utf8());
988            });
989    }
990
991    Some(start?..end?)
992}
993
994impl From<String> for NormalizedString {
995    fn from(s: String) -> Self {
996        let alignments = s
997            .char_indices()
998            .flat_map(|(b, c)| {
999                let len = c.len_utf8();
1000                (0..len).map(move |_| (b, b + len))
1001            })
1002            .collect::<Vec<_>>();
1003        Self {
1004            original: s.clone(),
1005            normalized: s,
1006            alignments,
1007            original_shift: 0,
1008        }
1009    }
1010}
1011
1012impl From<&str> for NormalizedString {
1013    fn from(s: &str) -> Self {
1014        Self::from(s.to_owned())
1015    }
1016}
1017
1018#[cfg(test)]
1019mod tests {
1020    use super::*;
1021    use regex::Regex;
1022    use unicode_categories::UnicodeCategories;
1023
1024    #[test]
1025    fn test_len_range_inclusive() {
1026        let range = Range::Original(3..=7);
1027        let len = range.len();
1028        assert_eq!(len, Some(5)); // 7 - 3 + 1 = 5
1029    }
1030
1031    #[test]
1032    fn test_len_range_exclusive() {
1033        let range = Range::Original(3..7);
1034        let len = range.len();
1035        assert_eq!(len, Some(4)); // 7 - 3 = 4
1036    }
1037
1038    #[test]
1039    fn nfd_adds_new_chars() {
1040        let mut n = NormalizedString::from("Γ©lΓ©gant");
1041        n.nfd();
1042        assert_eq!(
1043            &n.alignments,
1044            &[
1045                (0, 2),
1046                (0, 2),
1047                (0, 2),
1048                (2, 3),
1049                (3, 5),
1050                (3, 5),
1051                (3, 5),
1052                (5, 6),
1053                (6, 7),
1054                (7, 8),
1055                (8, 9)
1056            ]
1057        );
1058        assert_eq!(
1059            n.alignments_original(),
1060            vec![
1061                (0, 3),
1062                (0, 3),
1063                (3, 4),
1064                (4, 7),
1065                (4, 7),
1066                (7, 8),
1067                (8, 9),
1068                (9, 10),
1069                (10, 11)
1070            ]
1071        );
1072    }
1073
1074    #[test]
1075    fn remove_chars_added_by_nfd() {
1076        let mut n = NormalizedString::from("Γ©lΓ©gant");
1077        n.nfd().filter(|c| !c.is_mark_nonspacing());
1078
1079        assert_eq!(n.get(), "elegant");
1080
1081        assert_eq!(
1082            &n.alignments,
1083            &[(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 9)]
1084        );
1085        assert_eq!(
1086            n.alignments_original(),
1087            vec![
1088                (0, 1),
1089                (0, 1),
1090                (1, 2),
1091                (2, 3),
1092                (2, 3),
1093                (3, 4),
1094                (4, 5),
1095                (5, 6),
1096                (6, 7)
1097            ]
1098        );
1099    }
1100
1101    #[test]
1102    fn remove_chars() {
1103        let mut n = NormalizedString::from("Γ©lΓ©gant");
1104        n.filter(|c| c != 'n');
1105        assert_eq!(n.get(), "Γ©lΓ©gat");
1106        assert_eq!(
1107            &n.alignments,
1108            &[
1109                (0, 2),
1110                (0, 2),
1111                (2, 3),
1112                (3, 5),
1113                (3, 5),
1114                (5, 6),
1115                (6, 7),
1116                // Skipped range
1117                (8, 9)
1118            ]
1119        );
1120        assert_eq!(
1121            n.alignments_original(),
1122            vec![
1123                (0, 2),
1124                (0, 2),
1125                (2, 3),
1126                (3, 5),
1127                (3, 5),
1128                (5, 6),
1129                (6, 7),
1130                (7, 7), // Eaten n
1131                (7, 8)
1132            ]
1133        );
1134    }
1135
1136    #[test]
1137    fn mixed_addition_and_removal() {
1138        let mut n = NormalizedString::from("Γ©lΓ©gant");
1139        n.nfd().filter(|c| !c.is_mark_nonspacing() && c != 'n');
1140        assert_eq!(n.get(), "elegat");
1141        assert_eq!(
1142            &n.alignments,
1143            &[(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (8, 9)]
1144        );
1145        assert_eq!(
1146            n.alignments_original(),
1147            vec![
1148                (0, 1),
1149                (0, 1),
1150                (1, 2),
1151                (2, 3),
1152                (2, 3),
1153                (3, 4), // g
1154                (4, 5), // a
1155                (5, 5), // Eaten n
1156                (5, 6)
1157            ]
1158        );
1159    }
1160
1161    #[test]
1162    fn range_conversion() {
1163        let mut n = NormalizedString::from("    __Hello__   ");
1164        n.filter(|c| !c.is_whitespace()).lowercase();
1165        let hello_n = n.convert_offsets(Range::Original(6..11));
1166        assert_eq!(hello_n, Some(2..7));
1167        assert_eq!(
1168            n.get_range(Range::Normalized(hello_n.clone().unwrap())),
1169            Some("hello")
1170        );
1171        assert_eq!(
1172            n.get_range_original(Range::Normalized(hello_n.unwrap())),
1173            Some("Hello")
1174        );
1175        assert_eq!(n.get_range(Range::Original(6..11)), Some("hello"));
1176        assert_eq!(n.get_range_original(Range::Original(6..11)), Some("Hello"));
1177
1178        // Make sure we get None only in specific cases
1179        assert_eq!(n.convert_offsets(Range::Original(0..0)), Some(0..0));
1180        assert_eq!(n.convert_offsets(Range::Original(3..3)), Some(3..3));
1181        assert_eq!(n.convert_offsets(Range::Original(15..)), Some(9..9));
1182        assert_eq!(n.convert_offsets(Range::Original(16..)), Some(16..16));
1183        assert_eq!(n.convert_offsets(Range::Original(17..)), None);
1184        assert_eq!(n.convert_offsets(Range::Normalized(0..0)), Some(0..0));
1185        assert_eq!(n.convert_offsets(Range::Normalized(3..3)), Some(3..3));
1186        assert_eq!(n.convert_offsets(Range::Normalized(9..)), Some(9..9));
1187        assert_eq!(n.convert_offsets(Range::Normalized(10..)), None);
1188    }
1189
1190    #[test]
1191    fn original_range() {
1192        let mut n = NormalizedString::from("Hello_______ World!");
1193        n.filter(|c| c != '_').lowercase();
1194        let world_n = n.get_range(Range::Normalized(6..11)).unwrap();
1195        let world_o = n.get_range_original(Range::Normalized(6..11)).unwrap();
1196        assert_eq!(world_n, "world");
1197        assert_eq!(world_o, "World");
1198        let original_range = Range::Original(n.convert_offsets(Range::Normalized(6..11)).unwrap());
1199        assert_eq!(n.get_range(original_range.clone()).unwrap(), "world");
1200        assert_eq!(
1201            n.get_range_original(original_range.clone()).unwrap(),
1202            "World"
1203        );
1204        assert_eq!(original_range.into_full_range(n.len_original()), 13..18);
1205    }
1206
1207    #[test]
1208    fn added_around_edges() {
1209        let mut n = NormalizedString::from("Hello");
1210        n.transform(
1211            vec![
1212                (' ', 1),
1213                ('H', 0),
1214                ('e', 0),
1215                ('l', 0),
1216                ('l', 0),
1217                ('o', 0),
1218                (' ', 1),
1219            ],
1220            0,
1221        );
1222
1223        assert_eq!(&n.normalized, " Hello ");
1224        assert_eq!(
1225            n.get_range_original(Range::Normalized(1..n.normalized.len() - 1)),
1226            Some("Hello")
1227        );
1228    }
1229
1230    #[test]
1231    fn added_characters_alignment() {
1232        let mut n = NormalizedString::from("ι‡Žε£ No");
1233        n.transform(
1234            n.get().to_owned().chars().flat_map(|c| {
1235                if (c as usize) > 0x4E00 {
1236                    vec![(' ', 0), (c, 1), (' ', 1)]
1237                } else {
1238                    vec![(c, 0)]
1239                }
1240            }),
1241            0,
1242        );
1243
1244        assert_eq!(
1245            n,
1246            NormalizedString {
1247                original: "ι‡Žε£ No".into(),
1248                normalized: " ι‡Ž  口  No".into(),
1249                alignments: vec![
1250                    (0, 3),
1251                    (0, 3),
1252                    (0, 3),
1253                    (0, 3),
1254                    (0, 3),
1255                    (3, 6),
1256                    (3, 6),
1257                    (3, 6),
1258                    (3, 6),
1259                    (3, 6),
1260                    (6, 7),
1261                    (7, 8),
1262                    (8, 9)
1263                ],
1264                original_shift: 0
1265            }
1266        );
1267        assert_eq!(
1268            n.alignments_original(),
1269            vec![
1270                (0, 5),
1271                (0, 5),
1272                (0, 5),
1273                (5, 10),
1274                (5, 10),
1275                (5, 10),
1276                (10, 11),
1277                (11, 12),
1278                (12, 13)
1279            ]
1280        );
1281    }
1282
1283    #[test]
1284    fn remove_at_beginning() {
1285        let mut n = NormalizedString::from("     Hello");
1286        n.filter(|c| !c.is_whitespace());
1287        assert_eq!(
1288            n.get_range_original(Range::Normalized(1.."Hello".len())),
1289            Some("ello")
1290        );
1291        assert_eq!(
1292            n.get_range_original(Range::Normalized(0..n.normalized.len())),
1293            Some("Hello")
1294        );
1295    }
1296
1297    #[test]
1298    fn remove_at_end() {
1299        let mut n = NormalizedString::from("Hello    ");
1300        n.filter(|c| !c.is_whitespace());
1301        assert_eq!(n.get_range_original(Range::Normalized(0..4)), Some("Hell"));
1302        assert_eq!(
1303            n.get_range_original(Range::Normalized(0..n.normalized.len())),
1304            Some("Hello")
1305        );
1306    }
1307
1308    #[test]
1309    fn removed_around_both_edges() {
1310        let mut n = NormalizedString::from("  Hello  ");
1311        n.filter(|c| !c.is_whitespace());
1312        assert_eq!(&n.normalized, "Hello");
1313
1314        assert_eq!(
1315            n.get_range_original(Range::Normalized(0.."Hello".len())),
1316            Some("Hello")
1317        );
1318        assert_eq!(
1319            n.get_range_original(Range::Normalized(1.."Hell".len())),
1320            Some("ell")
1321        );
1322    }
1323
1324    #[test]
1325    fn lstrip() {
1326        let mut n = NormalizedString::from("  This is an example  ");
1327        n.lstrip();
1328        assert_eq!(&n.normalized, "This is an example  ");
1329        assert_eq!(
1330            n.get_range_original(Range::Normalized(0..n.normalized.len())),
1331            Some("This is an example  ")
1332        );
1333    }
1334
1335    #[test]
1336    fn rstrip() {
1337        let mut n = NormalizedString::from("  This is an example  ");
1338        n.rstrip();
1339        assert_eq!(&n.normalized, "  This is an example");
1340        assert_eq!(
1341            n.get_range_original(Range::Normalized(0..n.normalized.len())),
1342            Some("  This is an example")
1343        );
1344    }
1345
1346    #[test]
1347    fn strip() {
1348        let mut n = NormalizedString::from("  This is an example  ");
1349        n.strip();
1350        assert_eq!(&n.normalized, "This is an example");
1351        assert_eq!(
1352            n.get_range_original(Range::Normalized(0..n.normalized.len())),
1353            Some("This is an example")
1354        );
1355    }
1356
1357    #[test]
1358    fn strip_unicode() {
1359        let mut n = NormalizedString::from("  δ½ ε₯½asa \n");
1360        n.strip();
1361        assert_eq!(&n.normalized, "δ½ ε₯½asa");
1362        assert_eq!(
1363            n.get_range_original(Range::Normalized(0..n.normalized.len())),
1364            Some("δ½ ε₯½asa")
1365        );
1366    }
1367
1368    #[test]
1369    fn prepend() {
1370        let mut n = NormalizedString::from("there");
1371        n.prepend("Hey ");
1372        assert_eq!(&n.normalized, "Hey there");
1373        assert_eq!(
1374            n.alignments,
1375            vec![
1376                (0, 1),
1377                (0, 1),
1378                (0, 1),
1379                (0, 1),
1380                (0, 1),
1381                (1, 2),
1382                (2, 3),
1383                (3, 4),
1384                (4, 5)
1385            ]
1386        );
1387        assert_eq!(n.convert_offsets(Range::Normalized(0..4)), Some(0..1));
1388    }
1389
1390    #[test]
1391    fn append() {
1392        let mut n = NormalizedString::from("Hey");
1393        n.append(" there");
1394        assert_eq!(&n.normalized, "Hey there");
1395        assert_eq!(
1396            n.alignments,
1397            vec![
1398                (0, 1),
1399                (1, 2),
1400                (2, 3),
1401                (2, 3),
1402                (2, 3),
1403                (2, 3),
1404                (2, 3),
1405                (2, 3),
1406                (2, 3)
1407            ]
1408        );
1409        assert_eq!(
1410            n.convert_offsets(Range::Normalized(3.." there".len())),
1411            Some(2..3)
1412        );
1413    }
1414
1415    #[test]
1416    fn get_range() {
1417        let s = String::from("Hello my name is John πŸ‘‹");
1418        assert_eq!(get_range_of(&s, ..), Some(&s[..]));
1419        assert_eq!(get_range_of(&s, 17..), Some("John πŸ‘‹"));
1420    }
1421
1422    #[test]
1423    fn slice() {
1424        let mut s = NormalizedString::from("𝔾𝕠𝕠𝕕 π•žπ• π•£π•Ÿπ•šπ•Ÿπ•˜");
1425        s.nfkc();
1426
1427        let original_slice = s.slice(Range::Original(0..4)).unwrap();
1428        assert_eq!(original_slice.get(), "G");
1429        assert_eq!(original_slice.get_original(), "𝔾");
1430
1431        let normalized_slice = s.slice(Range::Normalized(0..4)).unwrap();
1432        assert_eq!(normalized_slice.get(), "Good");
1433        assert_eq!(normalized_slice.get_original(), "𝔾𝕠𝕠𝕕");
1434
1435        // Make sure the sliced NormalizedString is still aligned as expected
1436        let mut s = NormalizedString::from("   Good Morning!   ");
1437        s.strip();
1438
1439        // If we keep the whole slice
1440        let slice = s.slice(Range::Original(..)).unwrap();
1441        assert_eq!(
1442            slice.get_range_original(Range::Normalized(0..4)),
1443            Some("Good")
1444        );
1445        let slice = s.slice(Range::Normalized(..)).unwrap();
1446        assert_eq!(
1447            slice.get_range_original(Range::Normalized(0..4)),
1448            Some("Good")
1449        );
1450
1451        // If we keep after the modified piece
1452        let slice = s.slice(Range::Original(4..15)).unwrap();
1453        assert_eq!(
1454            slice.get_range_original(Range::Normalized(0..3)),
1455            Some("ood")
1456        );
1457
1458        // If we keep only the modified piece
1459        let slice = s.slice(Range::Original(3..16)).unwrap();
1460        assert_eq!(
1461            slice.get_range_original(Range::Normalized(0..4)),
1462            Some("Good")
1463        );
1464    }
1465
1466    #[test]
1467    fn replace() {
1468        // Simple
1469        let mut s = NormalizedString::from(" Hello   friend ");
1470        s.replace(' ', "_").unwrap();
1471        assert_eq!(s.get(), "_Hello___friend_");
1472        let mut s = NormalizedString::from("aaaab");
1473        s.replace('a', "b").unwrap();
1474        assert_eq!(s.get(), "bbbbb");
1475
1476        // Overlapping
1477        let mut s = NormalizedString::from("aaaab");
1478        s.replace("aaa", "b").unwrap();
1479        assert_eq!(s.get(), "bab");
1480
1481        // Regex
1482        let mut s = NormalizedString::from(" Hello   friend ");
1483        let re = Regex::new(r"\s+").unwrap();
1484        s.replace(&re, "_").unwrap();
1485        assert_eq!(s.get(), "_Hello_friend_");
1486    }
1487
1488    #[test]
1489    fn split() {
1490        use SplitDelimiterBehavior::*;
1491        let s = NormalizedString::from("The-final--countdown");
1492
1493        let test = |behavior: SplitDelimiterBehavior, result: Vec<&str>| {
1494            let splits = s.split('-', behavior).unwrap();
1495            assert_eq!(splits.iter().map(|n| n.get()).collect::<Vec<_>>(), result);
1496        };
1497
1498        test(Removed, vec!["The", "final", "countdown"]);
1499        test(Isolated, vec!["The", "-", "final", "-", "-", "countdown"]);
1500        test(MergedWithPrevious, vec!["The-", "final-", "-", "countdown"]);
1501        test(MergedWithNext, vec!["The", "-final", "-", "-countdown"]);
1502        test(Contiguous, vec!["The", "-", "final", "--", "countdown"]);
1503    }
1504
1505    #[test]
1506    fn transform_range_single_bytes() {
1507        let s = NormalizedString::from("Hello friend");
1508
1509        // Removing at the beginning
1510        let mut current = s.clone();
1511        current.transform_range(Range::Original(0..4), vec![('Y', 0)], 3);
1512        assert_eq!(
1513            current,
1514            NormalizedString {
1515                original: "Hello friend".into(),
1516                normalized: "Yo friend".into(),
1517                alignments: vec![
1518                    (3, 4),
1519                    (4, 5),
1520                    (5, 6),
1521                    (6, 7),
1522                    (7, 8),
1523                    (8, 9),
1524                    (9, 10),
1525                    (10, 11),
1526                    (11, 12)
1527                ],
1528                original_shift: 0,
1529            }
1530        );
1531
1532        assert_eq!(
1533            current.alignments_original(),
1534            vec![
1535                (0, 0),
1536                (0, 0),
1537                (0, 0),
1538                (0, 1),
1539                (1, 2),
1540                (2, 3),
1541                (3, 4),
1542                (4, 5),
1543                (5, 6),
1544                (6, 7),
1545                (7, 8),
1546                (8, 9)
1547            ]
1548        );
1549
1550        // Removing in the middle
1551        let mut current = s.clone();
1552        current.transform_range(
1553            Range::Original(3..10),
1554            vec![('_', 0), ('F', 0), ('R', -2)],
1555            2,
1556        );
1557        assert_eq!(
1558            current,
1559            NormalizedString {
1560                original: "Hello friend".into(),
1561                normalized: "Hel_FRnd".into(),
1562                alignments: vec![
1563                    (0, 1),
1564                    (1, 2),
1565                    (2, 3),
1566                    (5, 6),
1567                    (6, 7),
1568                    (7, 8),
1569                    (10, 11),
1570                    (11, 12)
1571                ],
1572                original_shift: 0,
1573            }
1574        );
1575
1576        assert_eq!(
1577            current.alignments_original(),
1578            vec![
1579                (0, 1),
1580                (1, 2),
1581                (2, 3),
1582                (3, 3),
1583                (3, 3),
1584                (3, 4),
1585                (4, 5),
1586                (5, 6),
1587                (6, 6),
1588                (6, 6),
1589                (6, 7),
1590                (7, 8)
1591            ]
1592        );
1593
1594        // Removing at the end
1595        let mut current = s.clone();
1596        current.transform_range(Range::Original(5..), vec![('_', 0), ('F', -5)], 0);
1597        assert_eq!(
1598            current,
1599            NormalizedString {
1600                original: "Hello friend".into(),
1601                normalized: "Hello_F".into(),
1602                alignments: vec![(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)],
1603                original_shift: 0,
1604            }
1605        );
1606        assert_eq!(
1607            current.alignments_original(),
1608            vec![
1609                (0, 1),
1610                (1, 2),
1611                (2, 3),
1612                (3, 4),
1613                (4, 5),
1614                (5, 6),
1615                (6, 7),
1616                (7, 7),
1617                (7, 7),
1618                (7, 7),
1619                (7, 7),
1620                (7, 7)
1621            ]
1622        );
1623
1624        // Adding at the beginning
1625        let mut current = s.clone();
1626        current.transform_range(Range::Original(0..1), vec![('H', 1), ('H', 0)], 0);
1627        assert_eq!(
1628            current,
1629            NormalizedString {
1630                original: "Hello friend".into(),
1631                normalized: "HHello friend".into(),
1632                alignments: vec![
1633                    (0, 0),
1634                    (0, 1),
1635                    (1, 2),
1636                    (2, 3),
1637                    (3, 4),
1638                    (4, 5),
1639                    (5, 6),
1640                    (6, 7),
1641                    (7, 8),
1642                    (8, 9),
1643                    (9, 10),
1644                    (10, 11),
1645                    (11, 12)
1646                ],
1647                original_shift: 0,
1648            }
1649        );
1650        assert_eq!(
1651            current.alignments_original(),
1652            vec![
1653                (1, 2),
1654                (2, 3),
1655                (3, 4),
1656                (4, 5),
1657                (5, 6),
1658                (6, 7),
1659                (7, 8),
1660                (8, 9),
1661                (9, 10),
1662                (10, 11),
1663                (11, 12),
1664                (12, 13)
1665            ]
1666        );
1667        // Equivalent to the previous one
1668        let mut current = s.clone();
1669        current.transform_range(Range::Original(0..0), vec![('H', 1)], 0);
1670        assert_eq!(
1671            current,
1672            NormalizedString {
1673                original: "Hello friend".into(),
1674                normalized: "HHello friend".into(),
1675                alignments: vec![
1676                    (0, 0),
1677                    (0, 1),
1678                    (1, 2),
1679                    (2, 3),
1680                    (3, 4),
1681                    (4, 5),
1682                    (5, 6),
1683                    (6, 7),
1684                    (7, 8),
1685                    (8, 9),
1686                    (9, 10),
1687                    (10, 11),
1688                    (11, 12)
1689                ],
1690                original_shift: 0,
1691            }
1692        );
1693        assert_eq!(
1694            current.alignments_original(),
1695            vec![
1696                (1, 2),
1697                (2, 3),
1698                (3, 4),
1699                (4, 5),
1700                (5, 6),
1701                (6, 7),
1702                (7, 8),
1703                (8, 9),
1704                (9, 10),
1705                (10, 11),
1706                (11, 12),
1707                (12, 13)
1708            ]
1709        );
1710        // Adding as part of the first character
1711        let mut current = s.clone();
1712        current.transform_range(Range::Original(0..1), vec![('H', 0), ('H', 1)], 0);
1713        assert_eq!(
1714            current,
1715            NormalizedString {
1716                original: "Hello friend".into(),
1717                normalized: "HHello friend".into(),
1718                alignments: vec![
1719                    (0, 1),
1720                    (0, 1),
1721                    (1, 2),
1722                    (2, 3),
1723                    (3, 4),
1724                    (4, 5),
1725                    (5, 6),
1726                    (6, 7),
1727                    (7, 8),
1728                    (8, 9),
1729                    (9, 10),
1730                    (10, 11),
1731                    (11, 12)
1732                ],
1733                original_shift: 0,
1734            }
1735        );
1736
1737        assert_eq!(
1738            current.alignments_original(),
1739            vec![
1740                (0, 2),
1741                (2, 3),
1742                (3, 4),
1743                (4, 5),
1744                (5, 6),
1745                (6, 7),
1746                (7, 8),
1747                (8, 9),
1748                (9, 10),
1749                (10, 11),
1750                (11, 12),
1751                (12, 13)
1752            ]
1753        );
1754
1755        // Adding in the middle
1756        let mut current = s.clone();
1757        current.transform_range(
1758            Range::Original(5..6),
1759            vec![('_', 0), ('m', 1), ('y', 1), ('_', 1)],
1760            0,
1761        );
1762        assert_eq!(
1763            current,
1764            NormalizedString {
1765                original: "Hello friend".into(),
1766                normalized: "Hello_my_friend".into(),
1767                alignments: vec![
1768                    (0, 1),
1769                    (1, 2),
1770                    (2, 3),
1771                    (3, 4),
1772                    (4, 5),
1773                    (5, 6),
1774                    (5, 6),
1775                    (5, 6),
1776                    (5, 6),
1777                    (6, 7),
1778                    (7, 8),
1779                    (8, 9),
1780                    (9, 10),
1781                    (10, 11),
1782                    (11, 12)
1783                ],
1784                original_shift: 0,
1785            }
1786        );
1787        assert_eq!(
1788            current.alignments_original(),
1789            vec![
1790                (0, 1),
1791                (1, 2),
1792                (2, 3),
1793                (3, 4),
1794                (4, 5),
1795                (5, 9),
1796                (9, 10),
1797                (10, 11),
1798                (11, 12),
1799                (12, 13),
1800                (13, 14),
1801                (14, 15)
1802            ]
1803        );
1804
1805        // Adding at the end
1806        let mut current = s;
1807        current.transform_range(Range::Original(11..), vec![('d', 0), ('_', 1), ('!', 1)], 0);
1808        assert_eq!(
1809            current,
1810            NormalizedString {
1811                original: "Hello friend".into(),
1812                normalized: "Hello friend_!".into(),
1813                alignments: vec![
1814                    (0, 1),
1815                    (1, 2),
1816                    (2, 3),
1817                    (3, 4),
1818                    (4, 5),
1819                    (5, 6),
1820                    (6, 7),
1821                    (7, 8),
1822                    (8, 9),
1823                    (9, 10),
1824                    (10, 11),
1825                    (11, 12),
1826                    (11, 12),
1827                    (11, 12)
1828                ],
1829                original_shift: 0,
1830            }
1831        );
1832        assert_eq!(
1833            current.alignments_original(),
1834            vec![
1835                (0, 1),
1836                (1, 2),
1837                (2, 3),
1838                (3, 4),
1839                (4, 5),
1840                (5, 6),
1841                (6, 7),
1842                (7, 8),
1843                (8, 9),
1844                (9, 10),
1845                (10, 11),
1846                (11, 14)
1847            ]
1848        );
1849    }
1850
1851    #[test]
1852    fn transform_range_multiple_bytes() {
1853        let s = NormalizedString::from("𝔾𝕠𝕠𝕕");
1854
1855        // Removing at the beginning
1856        let mut current = s.clone();
1857        current.transform_range(Range::Original(0..8), vec![('G', -1)], 0);
1858        assert_eq!(
1859            current,
1860            NormalizedString {
1861                original: "𝔾𝕠𝕠𝕕".into(),
1862                normalized: "G𝕠𝕕".into(),
1863                alignments: vec![
1864                    (0, 4),
1865                    (8, 12),
1866                    (8, 12),
1867                    (8, 12),
1868                    (8, 12),
1869                    (12, 16),
1870                    (12, 16),
1871                    (12, 16),
1872                    (12, 16)
1873                ],
1874                original_shift: 0,
1875            }
1876        );
1877        assert_eq!(
1878            current.alignments_original(),
1879            vec![
1880                (0, 1),
1881                (0, 1),
1882                (0, 1),
1883                (0, 1),
1884                (1, 1),
1885                (1, 1),
1886                (1, 1),
1887                (1, 1),
1888                (1, 5),
1889                (1, 5),
1890                (1, 5),
1891                (1, 5),
1892                (5, 9),
1893                (5, 9),
1894                (5, 9),
1895                (5, 9)
1896            ]
1897        );
1898        assert_eq!(current.get_range(Range::Original(0..8)).unwrap(), "G");
1899        assert_eq!(current.get_range(Range::Original(0..4)).unwrap(), "G");
1900        assert_eq!(
1901            current.get_range_original(Range::Original(0..4)).unwrap(),
1902            "𝔾"
1903        );
1904        assert_eq!(
1905            current.get_range_original(Range::Original(0..8)).unwrap(),
1906            "𝔾𝕠"
1907        );
1908
1909        // Removing in the middle
1910        let mut current = s.clone();
1911        current.transform_range(Range::Original(4..12), vec![('o', -1)], 0);
1912        assert_eq!(
1913            current,
1914            NormalizedString {
1915                original: "𝔾𝕠𝕠𝕕".into(),
1916                normalized: "𝔾o𝕕".into(),
1917                alignments: vec![
1918                    (0, 4),
1919                    (0, 4),
1920                    (0, 4),
1921                    (0, 4),
1922                    (4, 8),
1923                    (12, 16),
1924                    (12, 16),
1925                    (12, 16),
1926                    (12, 16)
1927                ],
1928                original_shift: 0,
1929            }
1930        );
1931        assert_eq!(
1932            current.alignments_original(),
1933            vec![
1934                (0, 4),
1935                (0, 4),
1936                (0, 4),
1937                (0, 4),
1938                (4, 5),
1939                (4, 5),
1940                (4, 5),
1941                (4, 5),
1942                (5, 5),
1943                (5, 5),
1944                (5, 5),
1945                (5, 5),
1946                (5, 9),
1947                (5, 9),
1948                (5, 9),
1949                (5, 9)
1950            ]
1951        );
1952
1953        // Removing at the end
1954        let mut current = s.clone();
1955        current.transform_range(Range::Original(12..), vec![('d', 0), ('!', 1)], 0);
1956        assert_eq!(
1957            current,
1958            NormalizedString {
1959                original: "𝔾𝕠𝕠𝕕".into(),
1960                normalized: "𝔾𝕠𝕠d!".into(),
1961                alignments: vec![
1962                    (0, 4),
1963                    (0, 4),
1964                    (0, 4),
1965                    (0, 4),
1966                    (4, 8),
1967                    (4, 8),
1968                    (4, 8),
1969                    (4, 8),
1970                    (8, 12),
1971                    (8, 12),
1972                    (8, 12),
1973                    (8, 12),
1974                    (12, 16),
1975                    (12, 16)
1976                ],
1977                original_shift: 0,
1978            }
1979        );
1980
1981        // Adding at the beginning
1982        let mut current = s.clone();
1983        current.transform_range(Range::Original(0..4), vec![('_', 1), ('𝔾', 0)], 0);
1984        assert_eq!(
1985            current,
1986            NormalizedString {
1987                original: "𝔾𝕠𝕠𝕕".into(),
1988                normalized: "_𝔾𝕠𝕠𝕕".into(),
1989                alignments: vec![
1990                    (0, 0),
1991                    (0, 4),
1992                    (0, 4),
1993                    (0, 4),
1994                    (0, 4),
1995                    (4, 8),
1996                    (4, 8),
1997                    (4, 8),
1998                    (4, 8),
1999                    (8, 12),
2000                    (8, 12),
2001                    (8, 12),
2002                    (8, 12),
2003                    (12, 16),
2004                    (12, 16),
2005                    (12, 16),
2006                    (12, 16)
2007                ],
2008                original_shift: 0,
2009            }
2010        );
2011        assert_eq!(
2012            current.alignments_original(),
2013            vec![
2014                (1, 5),
2015                (1, 5),
2016                (1, 5),
2017                (1, 5),
2018                (5, 9),
2019                (5, 9),
2020                (5, 9),
2021                (5, 9),
2022                (9, 13),
2023                (9, 13),
2024                (9, 13),
2025                (9, 13),
2026                (13, 17),
2027                (13, 17),
2028                (13, 17),
2029                (13, 17)
2030            ]
2031        );
2032
2033        assert_eq!(current.get_range(Range::Original(0..8)).unwrap(), "𝔾𝕠");
2034        assert_eq!(current.get_range(Range::Original(0..4)).unwrap(), "𝔾");
2035        assert_eq!(
2036            current.get_range_original(Range::Original(0..4)).unwrap(),
2037            "𝔾"
2038        );
2039        assert_eq!(
2040            current.get_range_original(Range::Original(0..8)).unwrap(),
2041            "𝔾𝕠"
2042        );
2043        // Equivalent to the previous one
2044        let mut current = s.clone();
2045        current.transform_range(Range::Original(0..0), vec![('_', 1)], 0);
2046        assert_eq!(
2047            current,
2048            NormalizedString {
2049                original: "𝔾𝕠𝕠𝕕".into(),
2050                normalized: "_𝔾𝕠𝕠𝕕".into(),
2051                alignments: vec![
2052                    (0, 0),
2053                    (0, 4),
2054                    (0, 4),
2055                    (0, 4),
2056                    (0, 4),
2057                    (4, 8),
2058                    (4, 8),
2059                    (4, 8),
2060                    (4, 8),
2061                    (8, 12),
2062                    (8, 12),
2063                    (8, 12),
2064                    (8, 12),
2065                    (12, 16),
2066                    (12, 16),
2067                    (12, 16),
2068                    (12, 16)
2069                ],
2070                original_shift: 0,
2071            }
2072        );
2073        assert_eq!(
2074            current.alignments_original(),
2075            vec![
2076                (1, 5),
2077                (1, 5),
2078                (1, 5),
2079                (1, 5),
2080                (5, 9),
2081                (5, 9),
2082                (5, 9),
2083                (5, 9),
2084                (9, 13),
2085                (9, 13),
2086                (9, 13),
2087                (9, 13),
2088                (13, 17),
2089                (13, 17),
2090                (13, 17),
2091                (13, 17)
2092            ]
2093        );
2094
2095        assert_eq!(current.get_range(Range::Original(0..8)).unwrap(), "𝔾𝕠");
2096        assert_eq!(current.get_range(Range::Original(0..4)).unwrap(), "𝔾");
2097        assert_eq!(
2098            current.get_range_original(Range::Original(0..4)).unwrap(),
2099            "𝔾"
2100        );
2101        assert_eq!(
2102            current.get_range_original(Range::Original(0..8)).unwrap(),
2103            "𝔾𝕠"
2104        );
2105        // Adding as part of the first character
2106        let mut current = s.clone();
2107        current.transform_range(Range::Original(0..4), vec![('𝔾', 0), ('o', 1)], 0);
2108        assert_eq!(
2109            current,
2110            NormalizedString {
2111                original: "𝔾𝕠𝕠𝕕".into(),
2112                normalized: "𝔾o𝕠𝕠𝕕".into(),
2113                alignments: vec![
2114                    (0, 4),
2115                    (0, 4),
2116                    (0, 4),
2117                    (0, 4),
2118                    (0, 4),
2119                    (4, 8),
2120                    (4, 8),
2121                    (4, 8),
2122                    (4, 8),
2123                    (8, 12),
2124                    (8, 12),
2125                    (8, 12),
2126                    (8, 12),
2127                    (12, 16),
2128                    (12, 16),
2129                    (12, 16),
2130                    (12, 16)
2131                ],
2132                original_shift: 0,
2133            }
2134        );
2135        assert_eq!(
2136            current.alignments_original(),
2137            vec![
2138                (0, 5),
2139                (0, 5),
2140                (0, 5),
2141                (0, 5),
2142                (5, 9),
2143                (5, 9),
2144                (5, 9),
2145                (5, 9),
2146                (9, 13),
2147                (9, 13),
2148                (9, 13),
2149                (9, 13),
2150                (13, 17),
2151                (13, 17),
2152                (13, 17),
2153                (13, 17)
2154            ]
2155        );
2156        assert_eq!(current.get_range(Range::Original(0..8)).unwrap(), "𝔾o𝕠");
2157        assert_eq!(current.get_range(Range::Original(0..4)).unwrap(), "𝔾o");
2158        assert_eq!(
2159            current.get_range_original(Range::Original(0..4)).unwrap(),
2160            "𝔾"
2161        );
2162        assert_eq!(
2163            current.get_range_original(Range::Original(0..8)).unwrap(),
2164            "𝔾𝕠"
2165        );
2166
2167        // Adding in the middle
2168        let mut current = s.clone();
2169        current.transform_range(
2170            Range::Original(4..8),
2171            vec![('𝕠', 0), ('o', 1), ('o', 1), ('o', 1)],
2172            0,
2173        );
2174        assert_eq!(
2175            current,
2176            NormalizedString {
2177                original: "𝔾𝕠𝕠𝕕".into(),
2178                normalized: "𝔾𝕠ooo𝕠𝕕".into(),
2179                alignments: vec![
2180                    (0, 4),
2181                    (0, 4),
2182                    (0, 4),
2183                    (0, 4),
2184                    (4, 8),
2185                    (4, 8),
2186                    (4, 8),
2187                    (4, 8),
2188                    (4, 8),
2189                    (4, 8),
2190                    (4, 8),
2191                    (8, 12),
2192                    (8, 12),
2193                    (8, 12),
2194                    (8, 12),
2195                    (12, 16),
2196                    (12, 16),
2197                    (12, 16),
2198                    (12, 16)
2199                ],
2200                original_shift: 0,
2201            }
2202        );
2203        assert_eq!(
2204            current.alignments_original(),
2205            vec![
2206                (0, 4),
2207                (0, 4),
2208                (0, 4),
2209                (0, 4),
2210                (4, 11),
2211                (4, 11),
2212                (4, 11),
2213                (4, 11),
2214                (11, 15),
2215                (11, 15),
2216                (11, 15),
2217                (11, 15),
2218                (15, 19),
2219                (15, 19),
2220                (15, 19),
2221                (15, 19)
2222            ]
2223        );
2224
2225        // Adding at the end
2226        let mut current = s;
2227        current.transform_range(Range::Original(16..), vec![('!', 1)], 0);
2228        assert_eq!(
2229            current,
2230            NormalizedString {
2231                original: "𝔾𝕠𝕠𝕕".into(),
2232                normalized: "𝔾𝕠𝕠𝕕!".into(),
2233                alignments: vec![
2234                    (0, 4),
2235                    (0, 4),
2236                    (0, 4),
2237                    (0, 4),
2238                    (4, 8),
2239                    (4, 8),
2240                    (4, 8),
2241                    (4, 8),
2242                    (8, 12),
2243                    (8, 12),
2244                    (8, 12),
2245                    (8, 12),
2246                    (12, 16),
2247                    (12, 16),
2248                    (12, 16),
2249                    (12, 16),
2250                    (12, 16)
2251                ],
2252                original_shift: 0,
2253            }
2254        );
2255        assert_eq!(
2256            current.alignments_original(),
2257            vec![
2258                (0, 4),
2259                (0, 4),
2260                (0, 4),
2261                (0, 4),
2262                (4, 8),
2263                (4, 8),
2264                (4, 8),
2265                (4, 8),
2266                (8, 12),
2267                (8, 12),
2268                (8, 12),
2269                (8, 12),
2270                (12, 17),
2271                (12, 17),
2272                (12, 17),
2273                (12, 17)
2274            ]
2275        );
2276    }
2277
2278    #[test]
2279    fn transform_check() {
2280        let mut s = NormalizedString::from("abc…");
2281        s.nfkd();
2282        let transforms = vec![('a', -2), ('.', 0), ('.', 0), ('.', 0)];
2283        s.transform(transforms, 0);
2284        s.lowercase();
2285        assert_eq!(s.get(), "a...");
2286    }
2287}