Skip to main content

whitespace_sifter/
lib.rs

1//! Sift duplicate whitespaces away in just one function call.
2//! This crate **helps you** remove duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) within a UTF-8 encoded `string`.\
3//! It naturally removes the whitespaces at the start and end of the `string`.
4//!
5//! # Examples
6//!
7//! ```rust
8//! use whitespace_sifter::WhitespaceSifter;
9//! // This prints `1.. 2.. 3.. 4.. 5..`.
10//! println!(
11//!     "{}",
12//!     "1.. \n2..  \n\r\n\n3..   \n\n\n4..    \n\n\r\n\n\n5..     \n\n\n\n\n".sift(),
13//! );
14//!
15//! // This prints `1..\n2..\n3..\n4..\r\n5..`.
16//! println!(
17//!     "{}",
18//!     "1.. \n2..  \n\r\n3..   \n\n\n4..    \r\n\n\r\n\n5..     \n\n\n\n\n"
19//!         .sift_preserve_newlines(),
20//! );
21//! ```
22
23mod character;
24mod sift;
25mod sift_preserve_newlines;
26mod unsafe_vec;
27
28use character::{get_char_metadata, Character, CARRIAGE_RETURN, LINE_FEED};
29use sift::sift_preallocated;
30use sift_preserve_newlines::sift_preallocated_until_newline;
31use unsafe_vec::{unsafe_custom_extend, unsafe_push};
32
33/// A trait containing all `string` whitespace-sifting functions.
34pub trait WhitespaceSifter: AsRef<str> {
35    /// This removes duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) from a `string` implementing `AsRef<str>`.
36    /// This follows the [is_ascii_whitespace](https://doc.rust-lang.org/std/primitive.char.html#method.is_ascii_whitespace) implementation.
37    /// This treats carriage-returns as just one `char` in the `string`.
38    #[must_use]
39    fn sift(&self) -> String {
40        let input: &str = self.as_ref();
41        let mut out: String = String::with_capacity(input.len());
42        sift_preallocated(input.as_ptr(), input.len(), unsafe { out.as_mut_vec() });
43        out
44    }
45
46    /// This removes duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) from a `string` implementing `AsRef<str>`.
47    /// This follows the [is_ascii_whitespace](https://doc.rust-lang.org/std/primitive.char.html#method.is_ascii_whitespace) implementation.
48    /// This preserves deduplicated newlines.
49    /// This treats carriage-returns as just one `char` in the `string`.
50    #[must_use]
51    fn sift_preserve_newlines(&self) -> String {
52        let input: &str = self.as_ref();
53        let in_ptr: *const u8 = input.as_ptr();
54        let in_len: usize = input.len();
55        let mut out: String = String::with_capacity(input.len());
56        let out_vec: &mut Vec<u8> = unsafe { out.as_mut_vec() };
57        let mut ind: usize = 0;
58        while ind < in_len {
59            sift_preallocated_until_newline(in_ptr, in_len, &mut ind, out_vec);
60        }
61        if out_vec.len() > 1 {
62            let new_out_mut_len: usize = unsafe { out_vec.len().unchecked_sub(2) };
63            if unsafe { out_vec.as_ptr().add(new_out_mut_len).read() } == CARRIAGE_RETURN {
64                unsafe { out_vec.set_len(new_out_mut_len) };
65                return out;
66            }
67            let new_out_mut_len: usize = unsafe { out_vec.len().unchecked_sub(1) };
68            if unsafe { out_vec.as_ptr().add(new_out_mut_len).read() } == LINE_FEED {
69                unsafe { out_vec.set_len(new_out_mut_len) };
70            }
71        }
72        out
73    }
74}
75
76impl<T: AsRef<str>> WhitespaceSifter for T {}
77
78/// A custom implementation of `str::trim_start`.
79#[allow(clippy::inline_always)]
80#[inline(always)]
81pub(crate) fn sift_trim_start(
82    in_ptr: *const u8,
83    in_len: usize,
84    ind: &mut usize,
85    out: &mut Vec<u8>,
86) {
87    while *ind < in_len {
88        match get_char_metadata(unsafe { in_ptr.add(*ind).read() }) {
89            Character::LineFeed | Character::CarriageReturn | Character::NormalWhitespace => {
90                *ind = unsafe { ind.unchecked_add(1) };
91            }
92            Character::SingleByte => {
93                unsafe { unsafe_push(out, in_ptr.add(*ind).read()) };
94                *ind = unsafe { ind.unchecked_add(1) };
95                break;
96            }
97            Character::MultiByte { len } => {
98                unsafe {
99                    unsafe_custom_extend(out, in_ptr.add(*ind), len as usize);
100                }
101                *ind = unsafe { ind.unchecked_add(len as usize) };
102                break;
103            }
104        }
105    }
106}
107
/// A custom implementation for `str::trim_end`.
///
/// When `is_last_whitespace` is `true`, the final byte of `out` is removed; otherwise `out`
/// is left untouched. The flag is supplied by the caller, which tracks whether the last byte
/// it wrote was a whitespace separator.
///
/// An empty `out` stays empty regardless of the flag, so this function is sound for every
/// input even though it is a safe `fn`.
#[allow(clippy::inline_always)]
#[inline(always)]
pub(crate) fn sift_trim_end(out: &mut Vec<u8>, is_last_whitespace: bool) {
    if is_last_whitespace {
        // `pop` shortens the vector by one byte and is a no-op on an empty vector, which
        // avoids the length underflow an unchecked subtraction would hit.
        out.pop();
    }
}
117
118#[cfg(test)]
119mod tests;
120
121#[cfg(test)]
122mod msrv_test;
123
124#[cfg(test)]
125mod compliance_test;