use crate::pattern::Pattern;
use crate::{Offsets, Result};
use std::ops::{Bound, RangeBounds};
use unicode_normalization_alignments::UnicodeNormalization;
use serde::{Deserialize, Serialize};
/// Shift the unsigned place `$origin` by the signed amount `$signed`, in place.
///
/// A strictly positive amount is added. Zero or a negative amount is subtracted by its
/// magnitude, and the result is clamped at 0 rather than wrapping below it.
macro_rules! apply_signed {
    ($origin: expr, $signed: expr) => {
        if !$signed.is_positive() {
            // Saturating subtraction gives the "stop at zero" behaviour directly.
            $origin = $origin.saturating_sub((-$signed) as usize);
        } else {
            $origin += $signed as usize;
        }
    };
}
/// Names the two referentials in which offsets can be expressed.
///
/// NOTE(review): this enum is not used in the visible part of this file; presumably callers
/// use it to choose between offsets into the original and the normalized string — confirm
/// against the call sites.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OffsetReferential {
// Presumably: offsets relative to the original string.
Original,
// Presumably: offsets relative to the normalized string.
Normalized,
}
/// A range tagged with the referential it is expressed in.
///
/// `Original` wraps a range over the original string and `Normalized` a range over the
/// normalized string. The wrapped value can be any `RangeBounds<usize>` (`a..b`, `a..=b`,
/// `..`, `a..`, ...).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Range<T: RangeBounds<usize> + Clone> {
    Original(T),
    Normalized(T),
}

#[allow(clippy::len_without_is_empty)]
impl<T> Range<T>
where
    T: RangeBounds<usize> + Clone,
{
    /// Consume the wrapper and return the inner range, dropping the referential tag.
    pub fn unwrap(self) -> T {
        match self {
            Self::Original(r) | Self::Normalized(r) => r,
        }
    }

    /// Number of positions covered by the range, or `None` when the end is unbounded.
    ///
    /// The bounds are interpreted exactly as in [`Range::into_full_range`]: an included
    /// start `i` begins at `i`, an excluded start `i` begins at `i + 1`. Empty or reversed
    /// ranges report a length of 0 instead of underflowing.
    pub fn len(&self) -> Option<usize> {
        // Borrow the inner range; no clone is needed just to inspect its bounds.
        let range = match self {
            Self::Original(r) | Self::Normalized(r) => r,
        };

        // Exclusive end position.
        let end = match range.end_bound() {
            Bound::Unbounded => return None,
            Bound::Included(i) => *i + 1,
            Bound::Excluded(i) => *i,
        };
        // Inclusive start position.
        let start = match range.start_bound() {
            Bound::Unbounded => 0,
            Bound::Included(i) => *i,
            Bound::Excluded(i) => *i + 1,
        };

        Some(end.saturating_sub(start))
    }

    /// Convert to a concrete `start..end` range, using 0 for an unbounded start and
    /// `max_len` for an unbounded end.
    pub fn into_full_range(self, max_len: usize) -> std::ops::Range<usize> {
        let range = self.unwrap();

        let start = match range.start_bound() {
            Bound::Unbounded => 0,
            Bound::Included(i) => *i,
            Bound::Excluded(i) => *i + 1,
        };
        let end = match range.end_bound() {
            Bound::Unbounded => max_len,
            Bound::Included(i) => *i + 1,
            Bound::Excluded(i) => *i,
        };

        start..end
    }
}
/// How `NormalizedString::split` treats the spans that matched the delimiter pattern.
///
/// The examples show the pieces produced when splitting `"The-final--countdown"` on `'-'`.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
pub enum SplitDelimiterBehavior {
// Matched spans are dropped: `["The", "final", "countdown"]`.
Removed,
// Every span, matched or not, becomes its own piece:
// `["The", "-", "final", "-", "-", "countdown"]`.
Isolated,
// A match that follows a non-match is appended to the previous piece:
// `["The-", "final-", "-", "countdown"]`.
MergedWithPrevious,
// Mirror image of `MergedWithPrevious`, computed right-to-left:
// `["The", "-final", "-", "-countdown"]`.
MergedWithNext,
// Consecutive spans with the same match status are merged:
// `["The", "-", "final", "--", "countdown"]`.
Contiguous,
}
/// A string together with a transformed ("normalized") version of it and the byte-level
/// alignment between the two.
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub struct NormalizedString {
// The string as it was before any transformation.
original: String,
// The string after the transformations applied so far.
normalized: String,
// One entry per byte of `normalized`: the `(start, end)` byte range of `original`
// that this byte is aligned with.
alignments: Vec<(usize, usize)>,
// Offset added to original offsets by `offsets_original`; `slice` sets it to the
// parent's shift plus the start of the sliced original range.
original_shift: usize,
}
impl NormalizedString {
/// Test-only constructor that builds a `NormalizedString` directly from its four fields,
/// without checking that they are consistent with each other.
#[cfg(test)]
pub(crate) fn new(
original: String,
normalized: String,
alignments: Vec<(usize, usize)>,
original_shift: usize,
) -> Self {
Self {
original,
normalized,
alignments,
original_shift,
}
}
/// Returns the normalized string.
pub fn get(&self) -> &str {
&self.normalized
}
/// Returns the original string.
pub fn get_original(&self) -> &str {
&self.original
}
/// Returns the `(start, end)` byte offsets of the original string, both shifted by
/// `original_shift`.
pub fn offsets_original(&self) -> Offsets {
    let start = self.original_shift;
    let end = start + self.len_original();
    (start, end)
}
/// Convert a range expressed in one referential into the matching range in the other.
///
/// A `Range::Original` is converted into a range over the normalized string, and a
/// `Range::Normalized` into a range over the original string. Unbounded sides are first
/// resolved against the length of the string the range refers to.
///
/// Returns:
/// - the resolved range itself, unchanged, when it is empty (`start == end`);
/// - `None` when it is reversed (`start > end`), when a normalized range falls outside
///   the alignments, or when no alignment ends at or before the end of an original range.
pub fn convert_offsets<T>(&self, range: Range<T>) -> Option<std::ops::Range<usize>>
where
    T: RangeBounds<usize> + Clone,
{
    let (target, original) = match range {
        Range::Original(_) => (range.into_full_range(self.len_original()), true),
        Range::Normalized(_) => (range.into_full_range(self.len()), false),
    };

    // Empty targets are returned as-is. This also covers `0..0` on an empty string, so no
    // separate special case for empty strings is needed after this point.
    if target.start == target.end {
        return Some(target);
    }
    if target.start > target.end {
        return None;
    }

    if !original {
        // Normalized -> original: span from the first covered alignment's start to the
        // last covered alignment's end.
        return self.alignments.get(target).and_then(expand_alignments);
    }

    // Original -> normalized: walk the alignments until one ends past the target's end.
    let (mut start, mut end) = (None, None);
    for (i, alignment) in self.alignments.iter().enumerate() {
        if target.end < alignment.1 {
            break;
        }
        // The first non-empty alignment starting at or after the target's start opens
        // the range; zero-width alignments never do.
        if start.is_none() && target.start <= alignment.0 && alignment.0 != alignment.1 {
            start = Some(i);
        }
        end = Some(i + 1);
    }

    match (start, end) {
        (Some(s), Some(e)) => Some(s..e),
        // No alignment starts inside the target: collapse to an empty range at the end.
        (None, Some(e)) => Some(e..e),
        // `end` is set on every iteration, so it can only be `None` if the loop body
        // never ran, in which case `start` is `None` as well.
        (_, None) => None,
    }
}
/// Returns the part of the normalized string selected by `range`.
///
/// An original range is first converted with `convert_offsets`. Returns `None` when that
/// conversion fails or when the resulting byte range is not a valid slice of the
/// normalized string.
pub fn get_range<T>(&self, range: Range<T>) -> Option<&str>
where
    T: RangeBounds<usize> + Clone,
{
    let span = match range {
        Range::Normalized(_) => range.into_full_range(self.len()),
        Range::Original(_) => self.convert_offsets(range)?,
    };
    self.normalized.get(span)
}
/// Returns the part of the original string selected by `range`.
///
/// A normalized range is first converted with `convert_offsets`. Returns `None` when that
/// conversion fails or when the resulting byte range is not a valid slice of the
/// original string.
pub fn get_range_original<T>(&self, range: Range<T>) -> Option<&str>
where
    T: RangeBounds<usize> + Clone,
{
    let span = match range {
        Range::Normalized(_) => self.convert_offsets(range)?,
        Range::Original(_) => range.into_full_range(self.len_original()),
    };
    self.original.get(span)
}
/// Resolve `range` to concrete bounds and check that both ends fall on char boundaries
/// of the string it refers to.
///
/// Returns the resolved range in the same referential, or `None` if either end is not a
/// char boundary (which includes being past the end of the string).
fn validate_range<T: RangeBounds<usize> + Clone>(
    &self,
    range: Range<T>,
) -> Option<Range<std::ops::Range<usize>>> {
    let on_boundaries = |text: &str, span: &std::ops::Range<usize>| {
        text.is_char_boundary(span.start) && text.is_char_boundary(span.end)
    };

    match range {
        Range::Original(_) => {
            let span = range.into_full_range(self.original.len());
            if on_boundaries(&self.original, &span) {
                Some(Range::Original(span))
            } else {
                None
            }
        }
        Range::Normalized(_) => {
            let span = range.into_full_range(self.normalized.len());
            if on_boundaries(&self.normalized, &span) {
                Some(Range::Normalized(span))
            } else {
                None
            }
        }
    }
}
/// Build a new `NormalizedString` holding only the part selected by `range`.
///
/// The range is validated against char boundaries and converted so that both the
/// original and the normalized sub-ranges are known. The copied alignments are re-based
/// so they are relative to the start of the sliced original, and `original_shift` grows
/// by that same start.
///
/// Returns `None` if the range is invalid or cannot be converted.
pub fn slice<T>(&self, range: Range<T>) -> Option<NormalizedString>
where
    T: RangeBounds<usize> + Clone,
{
    let full_range = self.validate_range(range)?;
    let (normalized_range, original_range) = match &full_range {
        Range::Original(span) => (self.convert_offsets(full_range.clone())?, span.clone()),
        Range::Normalized(span) => (span.clone(), self.convert_offsets(full_range.clone())?),
    };

    // Start of the slice in the original string; alignments are re-based on it.
    let shift = original_range.start;

    let original = self
        .get_range_original(full_range.clone())
        .unwrap_or_default()
        .to_owned();
    let normalized = self.get_range(full_range).unwrap_or_default().to_owned();
    let alignments = self
        .alignments
        .get(normalized_range)?
        .iter()
        .map(|&(start, end)| (start - shift, end - shift))
        .collect();

    Some(Self {
        original,
        normalized,
        alignments,
        original_shift: self.original_shift + shift,
    })
}
/// Replace the part of the normalized string selected by `range` with the chars yielded
/// by `dest`, rebuilding the alignments of that part along the way.
///
/// Each item of `dest` is `(c, changes)`:
/// - `changes > 0`: `c` is an added char. It consumes nothing from the replaced range and
///   takes the alignment of the byte just before the current position, or `(0, 0)` when
///   the current position is 0.
/// - `changes == 0`: `c` replaces the next char of the range and takes its alignment.
/// - `changes < 0`: `c` replaces the next char of the range, and the `-changes` chars that
///   follow it are removed as well.
///
/// `initial_offset` is the number of chars at the beginning of the range that are removed
/// before the first item of `dest` is applied.
///
/// If `range` is a `Range::Original` that `convert_offsets` cannot convert, the call
/// returns without modifying anything.
///
/// # Panics
///
/// Panics if the resolved range is not a valid slice of the normalized string, or if the
/// current position indexes past the end of `alignments`.
pub fn transform_range<T, I>(&mut self, range: Range<T>, dest: I, initial_offset: usize)
where
T: RangeBounds<usize> + Clone,
I: IntoIterator<Item = (char, isize)>,
{
// Resolve the target to a concrete byte range of the normalized string.
let n_range = match range {
Range::Normalized(_) => range.into_full_range(self.len()),
Range::Original(_) => match self.convert_offsets(range) {
Some(range) => range,
None => return,
},
};
trace!(
"===== transform_range call with {:?} (initial_offset: {}) =====",
n_range,
initial_offset
);
// The chars being replaced, consumed one at a time while `dest` is applied.
// Slicing here panics unless `n_range` lies on char boundaries.
let mut replaced_normalized = self.normalized[n_range.clone()]
.chars()
.collect::<Vec<_>>()
.into_iter();
// Byte size of the `initial_offset` chars dropped up front.
let initial_removed: usize = (&mut replaced_normalized)
.take(initial_offset)
.map(|c| c.len_utf8())
.sum();
// `offset` is the current byte position in the *old* normalized string.
let mut offset = (initial_removed + n_range.start) as isize;
let mut alignments = Vec::with_capacity(n_range.len());
trace!("=> Applying transformations");
let normalized = dest
.into_iter()
.map(|(c, changes)| {
trace!(
"### {:?} with size {}: {} with offset {} ###",
c,
c.len_utf8(),
match changes {
0 => "Replacing".into(),
ch if ch > 0 => "Adding".into(),
ch if ch < 0 => format!("Replacing + removing {} following chars", ch),
_ => "Undefined".into(),
},
offset
);
let idx = offset as usize;
// Added chars reuse the alignment of the previous byte; replacements reuse the
// alignment of the byte at the current position.
let align = if changes.is_positive() {
if idx < 1 {
(0, 0)
} else {
self.alignments[idx - 1]
}
} else {
self.alignments[idx]
};
// Only replacements consume a char from the old content.
let replaced_char = if !changes.is_positive() {
replaced_normalized.next()
} else {
None
};
let replaced_char_size = replaced_char.map_or(0, |c| c.len_utf8());
let replaced_char_size_change = c.len_utf8() as isize - replaced_char_size as isize;
if let Some(ref replaced_char) = replaced_char {
trace!(
"Replacing char {:?} - with a change in size: {}",
replaced_char,
replaced_char_size_change
);
}
// A negative `changes` additionally swallows that many following chars.
let total_bytes_to_remove = if changes.is_negative() {
(&mut replaced_normalized)
.take(-changes as usize)
.map(|c| c.len_utf8())
.sum()
} else {
0
};
trace!("Total bytes to remove: {}", total_bytes_to_remove);
// Advance past everything consumed from the old content.
offset += replaced_char_size as isize;
offset += total_bytes_to_remove as isize;
trace!("New offset: {}", offset);
trace!("New normalized alignment: {}x {:?}", c.len_utf8(), align);
// One alignment entry per byte of the new char.
alignments.extend((0..c.len_utf8()).map(|_| align));
c
})
.collect::<String>();
// Swap the old alignments and bytes of the range for the new ones.
self.alignments.splice(n_range.clone(), alignments);
// SAFETY: `n_range` was used above to slice `self.normalized` as a `str`, so it lies on
// char boundaries, and the replacement bytes come from the valid `String` `normalized`;
// the buffer therefore remains valid UTF-8 after the splice.
unsafe {
self.normalized
.as_mut_vec()
.splice(n_range, normalized.bytes());
}
}
/// Applies `dest` to the whole string by delegating to `transform_range` with the
/// unbounded original range `Range::Original(..)`; see `transform_range` for the meaning
/// of `dest` and `initial_offset`.
pub fn transform<I>(&mut self, dest: I, initial_offset: usize)
where
I: IntoIterator<Item = (char, isize)>,
{
self.transform_range(Range::Original(..), dest, initial_offset)
}
/// Runs the `nfd` transformation of `UnicodeNormalization` over a copy of the normalized
/// string and applies the resulting `(char, change)` stream with `transform`.
pub fn nfd(&mut self) -> &mut Self {
    let snapshot = self.normalized.clone();
    self.transform(snapshot.nfd(), 0);
    self
}
/// Runs the `nfkd` transformation of `UnicodeNormalization` over a copy of the normalized
/// string and applies the resulting `(char, change)` stream with `transform`.
pub fn nfkd(&mut self) -> &mut Self {
    let snapshot = self.normalized.clone();
    self.transform(snapshot.nfkd(), 0);
    self
}
/// Runs the `nfc` transformation of `UnicodeNormalization` over a copy of the normalized
/// string and applies the resulting `(char, change)` stream with `transform`.
pub fn nfc(&mut self) -> &mut Self {
    let snapshot = self.normalized.clone();
    self.transform(snapshot.nfc(), 0);
    self
}
/// Runs the `nfkc` transformation of `UnicodeNormalization` over a copy of the normalized
/// string and applies the resulting `(char, change)` stream with `transform`.
pub fn nfkc(&mut self) -> &mut Self {
    let snapshot = self.normalized.clone();
    self.transform(snapshot.nfkc(), 0);
    self
}
/// Remove from the normalized string every char for which `keep` returns `false`.
///
/// Each kept char is emitted to `transform` once the next kept char (or the end of the
/// string) is reached, tagged with the negated number of dropped chars that followed it.
/// Chars dropped before the first kept char are passed as the initial offset.
pub fn filter<F: Fn(char) -> bool>(&mut self, keep: F) -> &mut Self {
    let mut transforms: Vec<(char, isize)> = Vec::with_capacity(self.normalized.len());
    // Dropped chars seen before the first kept char.
    let mut leading_dropped: usize = 0;
    // Dropped chars seen since the last kept char.
    let mut dropped: isize = 0;
    // Last kept char, not yet emitted because its trailing removals are still unknown.
    let mut pending: Option<char> = None;

    for c in self.normalized.chars() {
        if !keep(c) {
            dropped += 1;
            continue;
        }
        if let Some(previous) = pending {
            transforms.push((previous, -dropped));
        } else {
            leading_dropped = dropped as usize;
        }
        pending = Some(c);
        dropped = 0;
    }
    if let Some(previous) = pending {
        transforms.push((previous, -dropped));
    }

    self.transform(transforms, leading_dropped);
    self
}
/// Insert `s` at the beginning of the normalized string.
///
/// The first char of `s` replaces the current first char, and the remaining chars of `s`
/// followed by the old first char are added after it, so every inserted byte shares the
/// alignment of the old first char.
///
/// Does nothing when `s` is empty or when the normalized string is empty.
pub fn prepend(&mut self, s: &str) -> &mut Self {
    // With nothing to insert, the transformation below would re-insert the first char as
    // an "added" char and overwrite its alignment with `(0, 0)`.
    if s.is_empty() {
        return self;
    }
    if let Some(next) = self.normalized.chars().next() {
        let transformations = s
            .chars()
            .enumerate()
            .map(|(i, c)| (c, isize::from(i != 0)))
            .chain(std::iter::once((next, 1)));
        self.transform_range(Range::Normalized(0..next.len_utf8()), transformations, 0);
    }
    self
}
/// Add `s` at the end of the normalized string.
///
/// The last char is replaced by itself and every char of `s` is added after it, so the
/// appended bytes share the alignment of that last char. Does nothing when the
/// normalized string is empty.
pub fn append(&mut self, s: &str) -> &mut Self {
    let last = self.normalized.char_indices().last();
    if let Some((byte_idx, last_char)) = last {
        let mut transformations: Vec<(char, isize)> = Vec::with_capacity(1 + s.len());
        transformations.push((last_char, 0));
        transformations.extend(s.chars().map(|c| (c, 1)));
        self.transform_range(Range::Normalized(byte_idx..), transformations, 0);
    }
    self
}
/// Replace every char of the normalized string with the char returned by `map`, one for
/// one.
pub fn map<F: Fn(char) -> char>(&mut self, map: F) -> &mut Self {
    let mut transformations: Vec<(char, isize)> = Vec::with_capacity(self.normalized.len());
    for c in self.normalized.chars() {
        transformations.push((map(c), 0));
    }
    self.transform(transformations, 0);
    self
}
/// Call `foreach` on every char of the normalized string, in order.
pub fn for_each<F: FnMut(char)>(&self, mut foreach: F) -> &Self {
    for c in self.normalized.chars() {
        foreach(c);
    }
    self
}
/// Lowercase the normalized string.
///
/// A char may lowercase to several chars: the first one replaces the source char and the
/// others are marked as added.
pub fn lowercase(&mut self) -> &mut Self {
    let lowered: Vec<(char, isize)> = self
        .normalized
        .chars()
        .flat_map(|c| {
            c.to_lowercase()
                .enumerate()
                .map(|(index, lc)| (lc, isize::from(index > 0)))
        })
        .collect();
    self.transform(lowered, 0);
    self
}
/// Uppercase the normalized string.
///
/// A char may uppercase to several chars: the first one replaces the source char and the
/// others are marked as added.
pub fn uppercase(&mut self) -> &mut Self {
    let uppered: Vec<(char, isize)> = self
        .normalized
        .chars()
        .flat_map(|c| {
            c.to_uppercase()
                .enumerate()
                .map(|(index, uc)| (uc, isize::from(index > 0)))
        })
        .collect();
    self.transform(uppered, 0);
    self
}
/// Replace every match of `pattern` in the normalized string with `content`.
///
/// Matches are located once, on the string as it is before any replacement; the offsets
/// of later matches are then shifted by the accumulated size difference of the earlier
/// replacements.
///
/// # Errors
///
/// Returns the error produced by `pattern.find_matches`, if any.
pub fn replace<P: Pattern>(&mut self, pattern: P, content: &str) -> Result<()> {
    let matches = pattern.find_matches(&self.normalized)?;
    // Every char of `content` is inserted, so each replacement adds exactly this many bytes.
    let new_len = content.len();
    // Byte drift between the pre-replacement offsets and the current string.
    let mut shift: isize = 0;

    for ((start, end), is_match) in matches {
        if !is_match {
            continue;
        }

        let mut range = start..end;
        apply_signed!(range.start, shift);
        apply_signed!(range.end, shift);

        // All chars of the match are removed up front, then `content` is added.
        let removed_chars = self.normalized[range.clone()].chars().count();
        self.transform_range(
            Range::Normalized(range),
            content.chars().map(|c| (c, 1)),
            removed_chars,
        );

        let old_len = end - start;
        shift += new_len as isize - old_len as isize;
    }
    Ok(())
}
/// Empty the normalized string by applying an empty transformation over it, and return
/// the byte length it had before.
pub fn clear(&mut self) -> usize {
    let previous_len = self.normalized.len();
    self.transform(std::iter::empty(), previous_len);
    previous_len
}
/// Split the normalized string on the matches of `pattern` and return the resulting
/// pieces as slices of `self`, in order.
///
/// `behavior` decides what happens to the matched (delimiter) spans; see
/// `SplitDelimiterBehavior`.
///
/// # Errors
///
/// Returns the error produced by `pattern.find_matches`, if any.
///
/// # Panics
///
/// Panics with "NormalizedString bad split" if one of the computed spans cannot be
/// sliced out of `self`.
pub fn split<P: Pattern>(
&self,
pattern: P,
behavior: SplitDelimiterBehavior,
) -> Result<Vec<NormalizedString>> {
let matches = pattern.find_matches(&self.normalized)?;
use SplitDelimiterBehavior::*;
// Build a list of `(offsets, remove)` spans; spans flagged `true` are dropped below.
let splits = match behavior {
// Keep every span as its own piece, delimiters included.
Isolated => matches
.into_iter()
.map(|(offsets, _)| (offsets, false))
.collect(),
// The `is_match` flag doubles as the `remove` flag: delimiters are dropped.
Removed => matches,
// Merge each span into the previous one when both have the same match status.
Contiguous => {
let mut previous_match = false;
matches
.into_iter()
.fold(vec![], |mut acc, (offsets, is_match)| {
if is_match == previous_match {
if let Some(((_, end), _)) = acc.last_mut() {
*end = offsets.1;
} else {
acc.push((offsets, false));
}
} else {
acc.push((offsets, false));
}
previous_match = is_match;
acc
})
}
// Extend the previous piece over a delimiter that directly follows a non-delimiter.
MergedWithPrevious => {
let mut previous_match = false;
matches
.into_iter()
.fold(vec![], |mut acc, (offsets, is_match)| {
if is_match && !previous_match {
if let Some(((_, end), _)) = acc.last_mut() {
*end = offsets.1;
} else {
acc.push((offsets, false));
}
} else {
acc.push((offsets, false));
}
previous_match = is_match;
acc
})
}
// Same as `MergedWithPrevious`, but walking the spans right-to-left so that a
// delimiter is attached to the piece that follows it; the result is then reversed
// back into left-to-right order.
MergedWithNext => {
let mut previous_match = false;
let mut matches =
matches
.into_iter()
.rev()
.fold(vec![], |mut acc, (offsets, is_match)| {
if is_match && !previous_match {
if let Some(((start, _), _)) = acc.last_mut() {
*start = offsets.0;
} else {
acc.push((offsets, false));
}
} else {
acc.push((offsets, false));
}
previous_match = is_match;
acc
});
matches.reverse();
matches
}
};
// Turn every span that was not flagged for removal into a slice of `self`.
Ok(splits
.into_iter()
.filter_map(|(offsets, remove)| {
if !remove {
Some(
self.slice(Range::Normalized(offsets.0..offsets.1))
.expect("NormalizedString bad split"),
)
} else {
None
}
})
.collect())
}
/// Removes leading whitespace from the normalized string.
pub fn lstrip(&mut self) -> &mut Self {
self.lrstrip(true, false)
}
/// Removes trailing whitespace from the normalized string.
pub fn rstrip(&mut self) -> &mut Self {
self.lrstrip(false, true)
}
/// Removes both leading and trailing whitespace from the normalized string.
pub fn strip(&mut self) -> &mut Self {
self.lrstrip(true, true)
}
/// Remove whitespace from the start (`left`) and/or the end (`right`) of the normalized
/// string.
///
/// Leading whitespace is passed to `transform` as the initial offset; trailing whitespace
/// is attached as a negative change to the last char that is kept.
fn lrstrip(&mut self, left: bool, right: bool) -> &mut Self {
    let leading_spaces = if left {
        self.normalized
            .chars()
            .take_while(|c| c.is_whitespace())
            .count()
    } else {
        0
    };
    let trailing_spaces = if right {
        self.normalized
            .chars()
            .rev()
            .take_while(|c| c.is_whitespace())
            .count()
    } else {
        0
    };

    if leading_spaces > 0 || trailing_spaces > 0 {
        // All positions below are char indices. The byte length must not be used here: it
        // differs from the char count as soon as the string holds a multi-byte char.
        let count = self.normalized.chars().count();
        // Char index one past the last kept char.
        let kept_end = count - trailing_spaces;

        let transformation = self
            .normalized
            .chars()
            .enumerate()
            .filter_map(|(i, c)| {
                if i < leading_spaces || i >= kept_end {
                    None
                } else if i + 1 == kept_end {
                    // Last kept char: it also swallows the trailing whitespace.
                    Some((c, -(trailing_spaces as isize)))
                } else {
                    Some((c, 0))
                }
            })
            .collect::<Vec<_>>();
        self.transform(transformation, leading_spaces);
    }
    self
}
/// Returns the length of the normalized string, in bytes.
pub fn len(&self) -> usize {
self.normalized.len()
}
/// Returns the length of the original string, in bytes.
pub fn len_original(&self) -> usize {
self.original.len()
}
/// Returns `true` when the normalized string is empty.
pub fn is_empty(&self) -> bool {
self.normalized.is_empty()
}
/// Compute the reverse alignments: for the bytes of the original string, the
/// `(start, end)` byte range of the normalized string they are aligned with.
///
/// Original bytes that no normalized byte points to get an empty range positioned at
/// the current normalized offset. When the normalized string is empty, every original
/// byte maps to `(0, 0)`.
///
/// # Panics
///
/// Panics if two consecutive, distinct alignments overlap in the original string.
// Only referenced from tests in the visible part of this file, hence the allow.
#[allow(dead_code)]
pub(crate) fn alignments_original(&self) -> Vec<(usize, usize)> {
    let mut alignments_original = Vec::with_capacity(self.original.len());

    // Nothing is aligned when the normalized string is empty (e.g. after `clear`);
    // indexing the first alignment would panic.
    let first = match self.alignments.first() {
        Some(first) => *first,
        None => {
            alignments_original.resize(self.original.len(), (0, 0));
            return alignments_original;
        }
    };

    // Original bytes located before the first aligned range.
    alignments_original.extend(std::iter::repeat((0, 0)).take(first.0));

    // `last` is the original range of the current run of identical alignments, `offset`
    // the normalized position where that run starts and `length` its size in bytes.
    let mut last = first;
    let mut offset = 0;
    let mut length = 0;
    for &(start, end) in &self.alignments {
        if last == (start, end) {
            // Still inside the same run.
            length += 1;
        } else {
            if start < last.1 {
                panic!("We can't have overlapping ranges.");
            }
            // Close the previous run: each of its original bytes maps to the whole run.
            alignments_original
                .extend(std::iter::repeat((offset, offset + length)).take(last.1 - last.0));
            offset += length;
            length = 1;
            // Original bytes skipped between the two runs map to an empty range.
            alignments_original.extend(std::iter::repeat((offset, offset)).take(start - last.1));
        }
        last = (start, end);
    }

    // Close the final run.
    alignments_original
        .extend(std::iter::repeat((offset, offset + length)).take(last.1 - last.0));
    offset += length;

    // Original bytes located after the last aligned range.
    let remaining = self.original.len() - alignments_original.len();
    alignments_original.extend(std::iter::repeat((offset, offset)).take(remaining));

    alignments_original
}
}
/// Returns the range going from the start of the first alignment to the end of the last
/// one, or `None` for an empty slice.
fn expand_alignments(alignments: &[(usize, usize)]) -> Option<std::ops::Range<usize>> {
    let first = alignments.first()?;
    let last = alignments.last()?;
    Some(first.0..last.1)
}
/// Returns the part of `s` selected by `range`, where `range` is expressed in chars
/// rather than bytes.
///
/// Returns `None` when the range starts at or past the end of `s`, ends past it, or is
/// empty or reversed — except for a range resolving to `0..0`, which always yields the
/// empty string.
pub fn get_range_of<T: RangeBounds<usize>>(s: &str, range: T) -> Option<&str> {
    let char_count = s.chars().count();
    let first = match range.start_bound() {
        Bound::Unbounded => 0,
        Bound::Included(i) => *i,
        Bound::Excluded(i) => *i + 1,
    };
    let past_last = match range.end_bound() {
        Bound::Unbounded => char_count,
        Bound::Included(i) => *i + 1,
        Bound::Excluded(i) => *i,
    };

    if first == 0 && past_last == 0 {
        return Some(&s[0..0]);
    }
    if first >= char_count || past_last > char_count || first >= past_last {
        return None;
    }

    // Byte position of the char at `char_idx`, or `fallback` when there is no such char.
    let byte_at = |char_idx: usize, fallback: usize| {
        s.char_indices()
            .nth(char_idx)
            .map_or(fallback, |(byte_idx, _)| byte_idx)
    };
    let start_b = byte_at(first, 0);
    let end_b = byte_at(past_last, s.len());
    Some(&s[start_b..end_b])
}
/// Convert a byte range of `s` into the matching char range.
///
/// Returns `None` when the start or the end of `range` does not fall on a char boundary
/// reachable in `s`. `0..0` always converts to `0..0`.
pub fn bytes_to_char(s: &str, range: std::ops::Range<usize>) -> Option<std::ops::Range<usize>> {
    let mut start = None;
    let mut end = None;
    if range == (0..0) {
        start = Some(0);
        end = Some(0);
    }

    for (char_idx, (byte_idx, c)) in s.char_indices().enumerate() {
        // Nothing at or after this char can touch the range any more.
        if byte_idx > range.end {
            break;
        }
        // Not inside the range yet.
        if byte_idx < range.start {
            continue;
        }
        if byte_idx == range.start {
            start = Some(char_idx);
        }
        if byte_idx == range.end {
            end = Some(char_idx);
        }
        // The range ends exactly after this char.
        if byte_idx + c.len_utf8() == range.end {
            end = Some(char_idx + 1);
        }
    }

    match (start, end) {
        (Some(first), Some(past_last)) => Some(first..past_last),
        _ => None,
    }
}
/// Convert a char range of `s` into the matching byte range.
///
/// - A reversed range (`start > end`) yields `None`.
/// - An empty range `n..n` yields the empty byte range at the position of char `n`, or
///   `None` if `s` has no char `n`; `0..0` always yields `0..0`.
/// - A non-empty range yields the bytes of the chars it covers. Chars past the end of `s`
///   are ignored, and `None` is returned when no char of `s` is covered at all.
pub fn char_to_bytes(s: &str, range: std::ops::Range<usize>) -> Option<std::ops::Range<usize>> {
    // A reversed range selects nothing; computing `end - start` for it would underflow.
    if range.start > range.end {
        return None;
    }

    if range.start == range.end {
        if range.start == 0 {
            return Some(0..0);
        }
        return s
            .char_indices()
            .nth(range.start)
            .map(|(byte_idx, _)| byte_idx..byte_idx);
    }

    let mut covered = s
        .char_indices()
        .skip(range.start)
        .take(range.end - range.start);
    let (first_byte, first_char) = covered.next()?;
    // End of the last covered char, which is the first one when it is the only one.
    let end = covered
        .last()
        .map_or(first_byte + first_char.len_utf8(), |(byte_idx, c)| {
            byte_idx + c.len_utf8()
        });
    Some(first_byte..end)
}
impl From<String> for NormalizedString {
    /// Build a `NormalizedString` whose normalized form is identical to `s`.
    ///
    /// Every byte of a char is aligned with the byte range of that same char, and the
    /// original shift starts at 0.
    fn from(s: String) -> Self {
        let mut alignments = Vec::with_capacity(s.len());
        for (start, c) in s.char_indices() {
            let width = c.len_utf8();
            // One entry per byte of the char, all pointing at the whole char.
            alignments.extend(std::iter::repeat((start, start + width)).take(width));
        }

        Self {
            original: s.clone(),
            normalized: s,
            alignments,
            original_shift: 0,
        }
    }
}
impl From<&str> for NormalizedString {
    /// Copy `s` into an owned `String` and build the `NormalizedString` from it.
    fn from(s: &str) -> Self {
        String::from(s).into()
    }
}
#[cfg(test)]
mod tests {
use super::*;
use regex::Regex;
use unicode_categories::UnicodeCategories;
#[test]
fn nfd_adds_new_chars() {
let mut n = NormalizedString::from("Γ©lΓ©gant");
n.nfd();
assert_eq!(
&n.alignments,
&[
(0, 2),
(0, 2),
(0, 2),
(2, 3),
(3, 5),
(3, 5),
(3, 5),
(5, 6),
(6, 7),
(7, 8),
(8, 9)
]
);
assert_eq!(
n.alignments_original(),
vec![
(0, 3),
(0, 3),
(3, 4),
(4, 7),
(4, 7),
(7, 8),
(8, 9),
(9, 10),
(10, 11)
]
);
}
#[test]
fn remove_chars_added_by_nfd() {
let mut n = NormalizedString::from("Γ©lΓ©gant");
n.nfd().filter(|c| !c.is_mark_nonspacing());
assert_eq!(n.get(), "elegant");
assert_eq!(
&n.alignments,
&[(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 9)]
);
assert_eq!(
n.alignments_original(),
vec![
(0, 1),
(0, 1),
(1, 2),
(2, 3),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 7)
]
);
}
#[test]
fn remove_chars() {
let mut n = NormalizedString::from("Γ©lΓ©gant");
n.filter(|c| c != 'n');
assert_eq!(n.get(), "Γ©lΓ©gat");
assert_eq!(
&n.alignments,
&[
(0, 2),
(0, 2),
(2, 3),
(3, 5),
(3, 5),
(5, 6),
(6, 7),
(8, 9)
]
);
assert_eq!(
n.alignments_original(),
vec![
(0, 2),
(0, 2),
(2, 3),
(3, 5),
(3, 5),
(5, 6),
(6, 7),
(7, 7), (7, 8)
]
);
}
#[test]
fn mixed_addition_and_removal() {
let mut n = NormalizedString::from("Γ©lΓ©gant");
n.nfd().filter(|c| !c.is_mark_nonspacing() && c != 'n');
assert_eq!(n.get(), "elegat");
assert_eq!(
&n.alignments,
&[(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (8, 9)]
);
assert_eq!(
n.alignments_original(),
vec![
(0, 1),
(0, 1),
(1, 2),
(2, 3),
(2, 3),
(3, 4), (4, 5), (5, 5), (5, 6)
]
);
}
#[test]
fn range_conversion() {
let mut n = NormalizedString::from(" __Hello__ ");
n.filter(|c| !c.is_whitespace()).lowercase();
let hello_n = n.convert_offsets(Range::Original(6..11));
assert_eq!(hello_n, Some(2..7));
assert_eq!(
n.get_range(Range::Normalized(hello_n.clone().unwrap())),
Some("hello")
);
assert_eq!(
n.get_range_original(Range::Normalized(hello_n.unwrap())),
Some("Hello")
);
assert_eq!(n.get_range(Range::Original(6..11)), Some("hello"));
assert_eq!(n.get_range_original(Range::Original(6..11)), Some("Hello"));
assert_eq!(n.convert_offsets(Range::Original(0..0)), Some(0..0));
assert_eq!(n.convert_offsets(Range::Original(3..3)), Some(3..3));
assert_eq!(n.convert_offsets(Range::Original(15..)), Some(9..9));
assert_eq!(n.convert_offsets(Range::Original(16..)), Some(16..16));
assert_eq!(n.convert_offsets(Range::Original(17..)), None);
assert_eq!(n.convert_offsets(Range::Normalized(0..0)), Some(0..0));
assert_eq!(n.convert_offsets(Range::Normalized(3..3)), Some(3..3));
assert_eq!(n.convert_offsets(Range::Normalized(9..)), Some(9..9));
assert_eq!(n.convert_offsets(Range::Normalized(10..)), None);
}
#[test]
fn original_range() {
let mut n = NormalizedString::from("Hello_______ World!");
n.filter(|c| c != '_').lowercase();
let world_n = n.get_range(Range::Normalized(6..11)).unwrap();
let world_o = n.get_range_original(Range::Normalized(6..11)).unwrap();
assert_eq!(world_n, "world");
assert_eq!(world_o, "World");
let original_range = Range::Original(n.convert_offsets(Range::Normalized(6..11)).unwrap());
assert_eq!(n.get_range(original_range.clone()).unwrap(), "world");
assert_eq!(
n.get_range_original(original_range.clone()).unwrap(),
"World"
);
assert_eq!(original_range.into_full_range(n.len_original()), 13..18);
}
#[test]
fn added_around_edges() {
let mut n = NormalizedString::from("Hello");
n.transform(
vec![
(' ', 1),
('H', 0),
('e', 0),
('l', 0),
('l', 0),
('o', 0),
(' ', 1),
]
.into_iter(),
0,
);
assert_eq!(&n.normalized, " Hello ");
assert_eq!(
n.get_range_original(Range::Normalized(1..n.normalized.len() - 1)),
Some("Hello")
);
}
#[test]
fn added_characters_alignment() {
let mut n = NormalizedString::from("ιε£ No");
n.transform(
n.get().to_owned().chars().flat_map(|c| {
if (c as usize) > 0x4E00 {
vec![(' ', 0), (c, 1), (' ', 1)]
} else {
vec![(c, 0)]
}
}),
0,
);
assert_eq!(
n,
NormalizedString {
original: "ιε£ No".into(),
normalized: " ι ε£ No".into(),
alignments: vec![
(0, 3),
(0, 3),
(0, 3),
(0, 3),
(0, 3),
(3, 6),
(3, 6),
(3, 6),
(3, 6),
(3, 6),
(6, 7),
(7, 8),
(8, 9)
],
original_shift: 0
}
);
assert_eq!(
n.alignments_original(),
vec![
(0, 5),
(0, 5),
(0, 5),
(5, 10),
(5, 10),
(5, 10),
(10, 11),
(11, 12),
(12, 13)
]
);
}
#[test]
fn remove_at_beginning() {
let mut n = NormalizedString::from(" Hello");
n.filter(|c| !c.is_whitespace());
assert_eq!(
n.get_range_original(Range::Normalized(1.."Hello".len())),
Some("ello")
);
assert_eq!(
n.get_range_original(Range::Normalized(0..n.normalized.len())),
Some("Hello")
);
}
#[test]
fn remove_at_end() {
let mut n = NormalizedString::from("Hello ");
n.filter(|c| !c.is_whitespace());
assert_eq!(n.get_range_original(Range::Normalized(0..4)), Some("Hell"));
assert_eq!(
n.get_range_original(Range::Normalized(0..n.normalized.len())),
Some("Hello")
);
}
#[test]
fn removed_around_both_edges() {
let mut n = NormalizedString::from(" Hello ");
n.filter(|c| !c.is_whitespace());
assert_eq!(&n.normalized, "Hello");
assert_eq!(
n.get_range_original(Range::Normalized(0.."Hello".len())),
Some("Hello")
);
assert_eq!(
n.get_range_original(Range::Normalized(1.."Hell".len())),
Some("ell")
);
}
#[test]
fn lstrip() {
let mut n = NormalizedString::from(" This is an example ");
n.lstrip();
assert_eq!(&n.normalized, "This is an example ");
assert_eq!(
n.get_range_original(Range::Normalized(0..n.normalized.len())),
Some("This is an example ")
);
}
#[test]
fn rstrip() {
let mut n = NormalizedString::from(" This is an example ");
n.rstrip();
assert_eq!(&n.normalized, " This is an example");
assert_eq!(
n.get_range_original(Range::Normalized(0..n.normalized.len())),
Some(" This is an example")
);
}
#[test]
fn strip() {
let mut n = NormalizedString::from(" This is an example ");
n.strip();
assert_eq!(&n.normalized, "This is an example");
assert_eq!(
n.get_range_original(Range::Normalized(0..n.normalized.len())),
Some("This is an example")
);
}
#[test]
fn strip_unicode() {
let mut n = NormalizedString::from(" δ½ ε₯½asa \n");
n.strip();
assert_eq!(&n.normalized, "δ½ ε₯½asa");
assert_eq!(
n.get_range_original(Range::Normalized(0..n.normalized.len())),
Some("δ½ ε₯½asa")
);
}
#[test]
fn prepend() {
let mut n = NormalizedString::from("there");
n.prepend("Hey ");
assert_eq!(&n.normalized, "Hey there");
assert_eq!(
n.alignments,
vec![
(0, 1),
(0, 1),
(0, 1),
(0, 1),
(0, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5)
]
);
assert_eq!(n.convert_offsets(Range::Normalized(0..4)), Some(0..1));
}
#[test]
fn append() {
let mut n = NormalizedString::from("Hey");
n.append(" there");
assert_eq!(&n.normalized, "Hey there");
assert_eq!(
n.alignments,
vec![
(0, 1),
(1, 2),
(2, 3),
(2, 3),
(2, 3),
(2, 3),
(2, 3),
(2, 3),
(2, 3)
]
);
assert_eq!(
n.convert_offsets(Range::Normalized(3.." there".len())),
Some(2..3)
);
}
#[test]
fn get_range() {
let s = String::from("Hello my name is John π");
assert_eq!(get_range_of(&s, ..), Some(&s[..]));
assert_eq!(get_range_of(&s, 17..), Some("John π"));
}
#[test]
fn slice() {
let mut s = NormalizedString::from("πΎπ π π ππ π£ππππ");
s.nfkc();
let original_slice = s.slice(Range::Original(0..4)).unwrap();
assert_eq!(original_slice.get(), "G");
assert_eq!(original_slice.get_original(), "πΎ");
let normalized_slice = s.slice(Range::Normalized(0..4)).unwrap();
assert_eq!(normalized_slice.get(), "Good");
assert_eq!(normalized_slice.get_original(), "πΎπ π π");
let mut s = NormalizedString::from(" Good Morning! ");
s.strip();
let slice = s.slice(Range::Original(..)).unwrap();
assert_eq!(
slice.get_range_original(Range::Normalized(0..4)),
Some("Good")
);
let slice = s.slice(Range::Normalized(..)).unwrap();
assert_eq!(
slice.get_range_original(Range::Normalized(0..4)),
Some("Good")
);
let slice = s.slice(Range::Original(4..15)).unwrap();
assert_eq!(
slice.get_range_original(Range::Normalized(0..3)),
Some("ood")
);
let slice = s.slice(Range::Original(3..16)).unwrap();
assert_eq!(
slice.get_range_original(Range::Normalized(0..4)),
Some("Good")
);
}
#[test]
fn replace() {
let mut s = NormalizedString::from(" Hello friend ");
s.replace(' ', "_").unwrap();
assert_eq!(s.get(), "_Hello___friend_");
let mut s = NormalizedString::from("aaaab");
s.replace('a', "b").unwrap();
assert_eq!(s.get(), "bbbbb");
let mut s = NormalizedString::from("aaaab");
s.replace("aaa", "b").unwrap();
assert_eq!(s.get(), "bab");
let mut s = NormalizedString::from(" Hello friend ");
let re = Regex::new(r"\s+").unwrap();
s.replace(&re, "_").unwrap();
assert_eq!(s.get(), "_Hello_friend_");
}
#[test]
fn split() {
use SplitDelimiterBehavior::*;
let s = NormalizedString::from("The-final--countdown");
let test = |behavior: SplitDelimiterBehavior, result: Vec<&str>| {
let splits = s.split('-', behavior).unwrap();
assert_eq!(splits.iter().map(|n| n.get()).collect::<Vec<_>>(), result);
};
test(Removed, vec!["The", "final", "countdown"]);
test(Isolated, vec!["The", "-", "final", "-", "-", "countdown"]);
test(MergedWithPrevious, vec!["The-", "final-", "-", "countdown"]);
test(MergedWithNext, vec!["The", "-final", "-", "-countdown"]);
test(Contiguous, vec!["The", "-", "final", "--", "countdown"]);
}
#[test]
fn transform_range_single_bytes() {
let s = NormalizedString::from("Hello friend");
let mut current = s.clone();
current.transform_range(Range::Original(0..4), vec![('Y', 0)], 3);
assert_eq!(
current,
NormalizedString {
original: "Hello friend".into(),
normalized: "Yo friend".into(),
alignments: vec![
(3, 4),
(4, 5),
(5, 6),
(6, 7),
(7, 8),
(8, 9),
(9, 10),
(10, 11),
(11, 12)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(0, 0),
(0, 0),
(0, 0),
(0, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 7),
(7, 8),
(8, 9)
]
);
let mut current = s.clone();
current.transform_range(
Range::Original(3..10),
vec![('_', 0), ('F', 0), ('R', -2)],
2,
);
assert_eq!(
current,
NormalizedString {
original: "Hello friend".into(),
normalized: "Hel_FRnd".into(),
alignments: vec![
(0, 1),
(1, 2),
(2, 3),
(5, 6),
(6, 7),
(7, 8),
(10, 11),
(11, 12)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(0, 1),
(1, 2),
(2, 3),
(3, 3),
(3, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 6),
(6, 6),
(6, 7),
(7, 8)
]
);
let mut current = s.clone();
current.transform_range(Range::Original(5..), vec![('_', 0), ('F', -5)], 0);
assert_eq!(
current,
NormalizedString {
original: "Hello friend".into(),
normalized: "Hello_F".into(),
alignments: vec![(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(0, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 7),
(7, 7),
(7, 7),
(7, 7),
(7, 7),
(7, 7)
]
);
let mut current = s.clone();
current.transform_range(Range::Original(0..1), vec![('H', 1), ('H', 0)], 0);
assert_eq!(
current,
NormalizedString {
original: "Hello friend".into(),
normalized: "HHello friend".into(),
alignments: vec![
(0, 0),
(0, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 7),
(7, 8),
(8, 9),
(9, 10),
(10, 11),
(11, 12)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 7),
(7, 8),
(8, 9),
(9, 10),
(10, 11),
(11, 12),
(12, 13)
]
);
let mut current = s.clone();
current.transform_range(Range::Original(0..0), vec![('H', 1)], 0);
assert_eq!(
current,
NormalizedString {
original: "Hello friend".into(),
normalized: "HHello friend".into(),
alignments: vec![
(0, 0),
(0, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 7),
(7, 8),
(8, 9),
(9, 10),
(10, 11),
(11, 12)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 7),
(7, 8),
(8, 9),
(9, 10),
(10, 11),
(11, 12),
(12, 13)
]
);
let mut current = s.clone();
current.transform_range(Range::Original(0..1), vec![('H', 0), ('H', 1)], 0);
assert_eq!(
current,
NormalizedString {
original: "Hello friend".into(),
normalized: "HHello friend".into(),
alignments: vec![
(0, 1),
(0, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 7),
(7, 8),
(8, 9),
(9, 10),
(10, 11),
(11, 12)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(0, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 7),
(7, 8),
(8, 9),
(9, 10),
(10, 11),
(11, 12),
(12, 13)
]
);
let mut current = s.clone();
current.transform_range(
Range::Original(5..6),
vec![('_', 0), ('m', 1), ('y', 1), ('_', 1)],
0,
);
assert_eq!(
current,
NormalizedString {
original: "Hello friend".into(),
normalized: "Hello_my_friend".into(),
alignments: vec![
(0, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(5, 6),
(5, 6),
(5, 6),
(6, 7),
(7, 8),
(8, 9),
(9, 10),
(10, 11),
(11, 12)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(0, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 9),
(9, 10),
(10, 11),
(11, 12),
(12, 13),
(13, 14),
(14, 15)
]
);
let mut current = s;
current.transform_range(Range::Original(11..), vec![('d', 0), ('_', 1), ('!', 1)], 0);
assert_eq!(
current,
NormalizedString {
original: "Hello friend".into(),
normalized: "Hello friend_!".into(),
alignments: vec![
(0, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 7),
(7, 8),
(8, 9),
(9, 10),
(10, 11),
(11, 12),
(11, 12),
(11, 12)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(0, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 7),
(7, 8),
(8, 9),
(9, 10),
(10, 11),
(11, 14)
]
);
}
/// Exercises `transform_range` on a string made only of 4-byte UTF-8 scalars,
/// checking that both alignment tables stay expressed in bytes.
///
/// The input is "𝔾𝕠𝕠𝕕" = U+1D53E, U+1D560, U+1D560, U+1D555 (16 bytes). These
/// literals must stay real UTF-8: a `char` literal holding several code points
/// does not compile, and every expected offset below assumes 4 bytes per
/// original character.
///
/// As the expectations show, `alignments` holds one `(start, end)` original
/// byte span per *normalized* byte, and `alignments_original()` one normalized
/// byte span per *original* byte. The vectors are laid out one source character
/// per row to make that visible.
///
/// The integer paired with each char appears to mean: `1` = inserted char,
/// `0` = replaces one char, `-n` = replaces one char and drops `n` more
/// (inferred from the expected results; see `transform_range` for the contract).
// Keep the one-character-per-row grouping of the expected tables.
#[rustfmt::skip]
#[test]
fn transform_range_multiple_bytes() {
    let s = NormalizedString::from("𝔾𝕠𝕠𝕕");

    // Removing at the beginning: "𝔾𝕠" collapses into a single 'G'.
    let mut current = s.clone();
    current.transform_range(Range::Original(0..8), vec![('G', -1)], 0);
    assert_eq!(
        current,
        NormalizedString {
            original: "𝔾𝕠𝕠𝕕".into(),
            normalized: "G𝕠𝕕".into(),
            alignments: vec![
                (0, 4),                                 // G
                (8, 12), (8, 12), (8, 12), (8, 12),     // 𝕠
                (12, 16), (12, 16), (12, 16), (12, 16), // 𝕕
            ],
            original_shift: 0,
        }
    );
    assert_eq!(
        current.alignments_original(),
        vec![
            (0, 1), (0, 1), (0, 1), (0, 1), // 𝔾 -> G
            (1, 1), (1, 1), (1, 1), (1, 1), // 𝕠 (removed, empty span)
            (1, 5), (1, 5), (1, 5), (1, 5), // 𝕠
            (5, 9), (5, 9), (5, 9), (5, 9), // 𝕕
        ]
    );
    assert_eq!(current.get_range(Range::Original(0..8)).unwrap(), "G");
    assert_eq!(current.get_range(Range::Original(0..4)).unwrap(), "G");
    assert_eq!(
        current.get_range_original(Range::Original(0..4)).unwrap(),
        "𝔾"
    );
    assert_eq!(
        current.get_range_original(Range::Original(0..8)).unwrap(),
        "𝔾𝕠"
    );

    // Removing in the middle: "𝕠𝕠" collapses into a single 'o'.
    let mut current = s.clone();
    current.transform_range(Range::Original(4..12), vec![('o', -1)], 0);
    assert_eq!(
        current,
        NormalizedString {
            original: "𝔾𝕠𝕠𝕕".into(),
            normalized: "𝔾o𝕕".into(),
            alignments: vec![
                (0, 4), (0, 4), (0, 4), (0, 4),         // 𝔾
                (4, 8),                                 // o
                (12, 16), (12, 16), (12, 16), (12, 16), // 𝕕
            ],
            original_shift: 0,
        }
    );
    assert_eq!(
        current.alignments_original(),
        vec![
            (0, 4), (0, 4), (0, 4), (0, 4), // 𝔾
            (4, 5), (4, 5), (4, 5), (4, 5), // 𝕠 -> o
            (5, 5), (5, 5), (5, 5), (5, 5), // 𝕠 (removed, empty span)
            (5, 9), (5, 9), (5, 9), (5, 9), // 𝕕
        ]
    );

    // Replacing at the end with an open-ended range: "𝕕" becomes "d!".
    let mut current = s.clone();
    current.transform_range(Range::Original(12..), vec![('d', 0), ('!', 1)], 0);
    assert_eq!(
        current,
        NormalizedString {
            original: "𝔾𝕠𝕠𝕕".into(),
            normalized: "𝔾𝕠𝕠d!".into(),
            alignments: vec![
                (0, 4), (0, 4), (0, 4), (0, 4),     // 𝔾
                (4, 8), (4, 8), (4, 8), (4, 8),     // 𝕠
                (8, 12), (8, 12), (8, 12), (8, 12), // 𝕠
                (12, 16), (12, 16),                 // d, !
            ],
            original_shift: 0,
        }
    );

    // Adding at the beginning: '_' is inserted in front of the kept '𝔾'.
    let mut current = s.clone();
    current.transform_range(Range::Original(0..4), vec![('_', 1), ('𝔾', 0)], 0);
    assert_eq!(
        current,
        NormalizedString {
            original: "𝔾𝕠𝕠𝕕".into(),
            normalized: "_𝔾𝕠𝕠𝕕".into(),
            alignments: vec![
                (0, 0),                                 // _ (inserted, empty span)
                (0, 4), (0, 4), (0, 4), (0, 4),         // 𝔾
                (4, 8), (4, 8), (4, 8), (4, 8),         // 𝕠
                (8, 12), (8, 12), (8, 12), (8, 12),     // 𝕠
                (12, 16), (12, 16), (12, 16), (12, 16), // 𝕕
            ],
            original_shift: 0,
        }
    );
    assert_eq!(
        current.alignments_original(),
        vec![
            (1, 5), (1, 5), (1, 5), (1, 5),     // 𝔾
            (5, 9), (5, 9), (5, 9), (5, 9),     // 𝕠
            (9, 13), (9, 13), (9, 13), (9, 13), // 𝕠
            (13, 17), (13, 17), (13, 17), (13, 17), // 𝕕
        ]
    );
    assert_eq!(current.get_range(Range::Original(0..8)).unwrap(), "𝔾𝕠");
    assert_eq!(current.get_range(Range::Original(0..4)).unwrap(), "𝔾");
    assert_eq!(
        current.get_range_original(Range::Original(0..4)).unwrap(),
        "𝔾"
    );
    assert_eq!(
        current.get_range_original(Range::Original(0..8)).unwrap(),
        "𝔾𝕠"
    );

    // Equivalent to the previous case, but inserting through an empty range.
    let mut current = s.clone();
    current.transform_range(Range::Original(0..0), vec![('_', 1)], 0);
    assert_eq!(
        current,
        NormalizedString {
            original: "𝔾𝕠𝕠𝕕".into(),
            normalized: "_𝔾𝕠𝕠𝕕".into(),
            alignments: vec![
                (0, 0),                                 // _ (inserted, empty span)
                (0, 4), (0, 4), (0, 4), (0, 4),         // 𝔾
                (4, 8), (4, 8), (4, 8), (4, 8),         // 𝕠
                (8, 12), (8, 12), (8, 12), (8, 12),     // 𝕠
                (12, 16), (12, 16), (12, 16), (12, 16), // 𝕕
            ],
            original_shift: 0,
        }
    );
    assert_eq!(
        current.alignments_original(),
        vec![
            (1, 5), (1, 5), (1, 5), (1, 5),     // 𝔾
            (5, 9), (5, 9), (5, 9), (5, 9),     // 𝕠
            (9, 13), (9, 13), (9, 13), (9, 13), // 𝕠
            (13, 17), (13, 17), (13, 17), (13, 17), // 𝕕
        ]
    );
    assert_eq!(current.get_range(Range::Original(0..8)).unwrap(), "𝔾𝕠");
    assert_eq!(current.get_range(Range::Original(0..4)).unwrap(), "𝔾");
    assert_eq!(
        current.get_range_original(Range::Original(0..4)).unwrap(),
        "𝔾"
    );
    assert_eq!(
        current.get_range_original(Range::Original(0..8)).unwrap(),
        "𝔾𝕠"
    );

    // Adding right after the first character: the new 'o' shares 𝔾's span.
    let mut current = s.clone();
    current.transform_range(Range::Original(0..4), vec![('𝔾', 0), ('o', 1)], 0);
    assert_eq!(
        current,
        NormalizedString {
            original: "𝔾𝕠𝕠𝕕".into(),
            normalized: "𝔾o𝕠𝕠𝕕".into(),
            alignments: vec![
                (0, 4), (0, 4), (0, 4), (0, 4),         // 𝔾
                (0, 4),                                 // o (inserted)
                (4, 8), (4, 8), (4, 8), (4, 8),         // 𝕠
                (8, 12), (8, 12), (8, 12), (8, 12),     // 𝕠
                (12, 16), (12, 16), (12, 16), (12, 16), // 𝕕
            ],
            original_shift: 0,
        }
    );
    assert_eq!(
        current.alignments_original(),
        vec![
            (0, 5), (0, 5), (0, 5), (0, 5),     // 𝔾 -> "𝔾o"
            (5, 9), (5, 9), (5, 9), (5, 9),     // 𝕠
            (9, 13), (9, 13), (9, 13), (9, 13), // 𝕠
            (13, 17), (13, 17), (13, 17), (13, 17), // 𝕕
        ]
    );
    assert_eq!(current.get_range(Range::Original(0..8)).unwrap(), "𝔾o𝕠");
    assert_eq!(current.get_range(Range::Original(0..4)).unwrap(), "𝔾o");
    assert_eq!(
        current.get_range_original(Range::Original(0..4)).unwrap(),
        "𝔾"
    );
    assert_eq!(
        current.get_range_original(Range::Original(0..8)).unwrap(),
        "𝔾𝕠"
    );

    // Adding in the middle: three 'o' are inserted after the first '𝕠'.
    let mut current = s.clone();
    current.transform_range(
        Range::Original(4..8),
        vec![('𝕠', 0), ('o', 1), ('o', 1), ('o', 1)],
        0,
    );
    assert_eq!(
        current,
        NormalizedString {
            original: "𝔾𝕠𝕠𝕕".into(),
            normalized: "𝔾𝕠ooo𝕠𝕕".into(),
            alignments: vec![
                (0, 4), (0, 4), (0, 4), (0, 4),         // 𝔾
                (4, 8), (4, 8), (4, 8), (4, 8),         // 𝕠
                (4, 8), (4, 8), (4, 8),                 // o, o, o (inserted)
                (8, 12), (8, 12), (8, 12), (8, 12),     // 𝕠
                (12, 16), (12, 16), (12, 16), (12, 16), // 𝕕
            ],
            original_shift: 0,
        }
    );
    assert_eq!(
        current.alignments_original(),
        vec![
            (0, 4), (0, 4), (0, 4), (0, 4),         // 𝔾
            (4, 11), (4, 11), (4, 11), (4, 11),     // 𝕠 -> "𝕠ooo"
            (11, 15), (11, 15), (11, 15), (11, 15), // 𝕠
            (15, 19), (15, 19), (15, 19), (15, 19), // 𝕕
        ]
    );

    // Adding at the end: the range starts exactly at the end of the original.
    let mut current = s;
    current.transform_range(Range::Original(16..), vec![('!', 1)], 0);
    assert_eq!(
        current,
        NormalizedString {
            original: "𝔾𝕠𝕠𝕕".into(),
            normalized: "𝔾𝕠𝕠𝕕!".into(),
            alignments: vec![
                (0, 4), (0, 4), (0, 4), (0, 4),         // 𝔾
                (4, 8), (4, 8), (4, 8), (4, 8),         // 𝕠
                (8, 12), (8, 12), (8, 12), (8, 12),     // 𝕠
                (12, 16), (12, 16), (12, 16), (12, 16), // 𝕕
                (12, 16),                               // ! (inserted)
            ],
            original_shift: 0,
        }
    );
    assert_eq!(
        current.alignments_original(),
        vec![
            (0, 4), (0, 4), (0, 4), (0, 4),         // 𝔾
            (4, 8), (4, 8), (4, 8), (4, 8),         // 𝕠
            (8, 12), (8, 12), (8, 12), (8, 12),     // 𝕠
            (12, 17), (12, 17), (12, 17), (12, 17), // 𝕕 -> "𝕕!"
        ]
    );
}
/// Checks that `transform` can be applied on top of an NFKD-normalized string
/// and that `lowercase` still works afterwards.
///
/// The input must end with a real HORIZONTAL ELLIPSIS (U+2026): its
/// compatibility decomposition is three '.' characters, so after `nfkd()` the
/// normalized text is "abc..." and the transform list below lines up with it.
#[test]
fn transform_check() {
    let mut s = NormalizedString::from("abc…");
    s.nfkd();
    // Presumably: keep 'a' while dropping the two following chars ("bc"),
    // then keep each of the three dots as-is — confirm against `transform`.
    let transforms = vec![('a', -2), ('.', 0), ('.', 0), ('.', 0)];
    s.transform(transforms, 0);
    s.lowercase();
    assert_eq!(s.get(), "a...");
}
}