html2md/extended/
sifter.rs

1use auto_encoder::auto_encode_bytes;
2use std::str;
3
/// Character classification for the byte at the current cursor position,
/// as produced by `get_char_metadata`.
enum Character {
    /// An ASCII byte (`0x00..=0x7F`); `data` is the byte itself.
    SingleByte { data: u8 },
    /// A non-ASCII leading byte; `len` is how many bytes the callers consume for it.
    MultiByte { len: usize },
}
9
10/// A trait containing all `string` whitespace-sifting functions.
11pub trait WhitespaceSifter: AsRef<str> {
12    /// This removes duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) from a `string` implementing `AsRef<str>`.
13    /// This follows the [is_ascii_whitespace](https://doc.rust-lang.org/std/primitive.char.html#method.is_ascii_whitespace) implementation.
14    /// This treats carriage-returns as just one `char` in the `string`.
15    #[must_use]
16    fn sift(&self) -> String {
17        let input: &str = self.as_ref();
18        let mut out: String = String::with_capacity(input.len());
19        sift_preallocated(input.as_bytes(), &mut out);
20        out
21    }
22
23    /// This removes duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) from a `string` implementing `AsRef<str>`.
24    /// This follows the [is_ascii_whitespace](https://doc.rust-lang.org/std/primitive.char.html#method.is_ascii_whitespace) implementation.
25    /// This preserves deduplicated newlines.
26    /// This treats carriage-returns as just one `char` in the `string`.
27    #[must_use]
28    fn sift_preserve_newlines(&self) -> String {
29        let input = self.as_ref();
30        let mut out = String::with_capacity(input.len());
31        let bytes = input.as_bytes();
32        let mut ind: usize = 0;
33
34        while ind < bytes.len() {
35            sift_preallocated_until_newline(bytes, &mut ind, &mut out);
36        }
37
38        if out.ends_with("\r\n") {
39            let _ = out.pop();
40            let _ = out.pop();
41        } else if out.ends_with('\n') {
42            let _ = out.pop();
43        }
44
45        out
46    }
47}
48
49/// A trait containing all `Vec<u8>` whitespace-sifting functions.
50pub trait WhitespaceSifterBytes: AsRef<[u8]> {
51    /// This removes duplicate whitespaces from a `Vec<u8>`.
52    /// It supports the same whitespace definition as [char::is_ascii_whitespace].
53    #[must_use]
54    fn sift_bytes(&self) -> String {
55        let input = self.as_ref();
56        let mut out: String = String::with_capacity(input.len());
57        sift_preallocated(input, &mut out);
58        out
59    }
60
61    /// This removes duplicate whitespaces from a `Vec<u8>`.
62    /// It preserves deduplicated newlines.
63    #[must_use]
64    fn sift_bytes_preserve_newlines(&self) -> String {
65        let bytes = self.as_ref();
66        let mut out = String::with_capacity(bytes.len());
67        let mut ind: usize = 0;
68
69        while ind < bytes.len() {
70            sift_preallocated_until_newline(bytes, &mut ind, &mut out);
71        }
72
73        if out.ends_with("\r\n") {
74            let _ = out.pop();
75            let _ = out.pop();
76        } else if out.ends_with('\n') {
77            let _ = out.pop();
78        }
79
80        out
81    }
82}
83
84impl<T: AsRef<str>> WhitespaceSifter for T {}
85impl<T: AsRef<[u8]>> WhitespaceSifterBytes for T {}
86
87/// A custom implementation of `str::trim_start`.
88fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) {
89    while *ind < bytes.len() {
90        match get_char_metadata(bytes[*ind]) {
91            Character::SingleByte { data } => {
92                *ind += 1;
93                if !is_ascii_whitespace(data) {
94                    out.push(data as char);
95                    break;
96                }
97            }
98            Character::MultiByte { len } => {
99                extend_from_bytes_with_len(bytes, ind, out, len);
100                break;
101            }
102        }
103    }
104}
105
/// A hand-rolled counterpart of `str::trim_end` for already-sifted output.
///
/// When `is_last_whitespace` is `true` the final `char` of `out` is removed;
/// otherwise `out` is left untouched. Popping from an empty string is a no-op.
fn sift_trim_end(out: &mut String, is_last_whitespace: bool) {
    if !is_last_whitespace {
        return;
    }
    let _ = out.pop();
}
112
113/// Extend the bytes from a slice.
114fn extend_from_bytes_with_len(bytes: &[u8], ind: &mut usize, out: &mut String, len: usize) {
115    let end = ind.saturating_add(len);
116    // Check bounds to ensure we don't run into an out-of-bounds error.
117    if *ind <= end && end <= bytes.len() {
118        let output = auto_encode_bytes(&bytes[*ind..end]);
119        out.push_str(&output);
120    }
121    *ind = end;
122}
123
124#[inline]
125const fn is_newline(codepoint: u8) -> bool {
126    matches!(codepoint, LINE_FEED | CARRIAGE_RETURN)
127}
128
129/// Sift preallocate safe strings.
130fn sift_preallocated(bytes: &[u8], out: &mut String) {
131    if !bytes.is_empty() {
132        let mut ind: usize = 0;
133        sift_trim_start(bytes, &mut ind, out);
134        let mut is_last_whitespace: bool = false;
135        let mut is_last_carriage_return: bool = false;
136
137        while ind < bytes.len() {
138            match get_char_metadata(bytes[ind]) {
139                Character::SingleByte { data } => {
140                    ind += 1;
141                    if is_ascii_whitespace(data) {
142                        if data == LINE_FEED && is_last_carriage_return {
143                            out.push('\n');
144                            is_last_carriage_return = false;
145                            continue;
146                        }
147                        if is_last_whitespace {
148                            continue;
149                        }
150                        is_last_whitespace = true;
151                    } else {
152                        is_last_whitespace = false;
153                    }
154                    out.push(data as char);
155                }
156                Character::MultiByte { len } => {
157                    extend_from_bytes_with_len(bytes, &mut ind, out, len);
158                }
159            }
160            is_last_carriage_return = false;
161        }
162        sift_trim_end(out, is_last_whitespace);
163    }
164}
165
166/// Sift preallocate until complete.
167fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut String) {
168    sift_trim_start(bytes, ind, out);
169
170    let mut is_last_whitespace = false;
171    let mut is_last_carriage_return = false;
172
173    while *ind < bytes.len() {
174        match get_char_metadata(bytes[*ind]) {
175            Character::SingleByte { data } => {
176                *ind += 1;
177                if is_ascii_whitespace(data) {
178                    if is_newline(data) {
179                        if is_last_carriage_return {
180                            out.push('\r');
181                        }
182                        out.push('\n');
183                        return;
184                    }
185                    is_last_carriage_return = data == CARRIAGE_RETURN;
186                    if is_last_whitespace {
187                        continue;
188                    }
189                    is_last_whitespace = true;
190                } else {
191                    is_last_whitespace = false;
192                }
193                out.push(data as char);
194            }
195            Character::MultiByte { len } => {
196                extend_from_bytes_with_len(bytes, ind, out, len);
197            }
198        }
199        is_last_carriage_return = false;
200    }
201    sift_trim_end(out, is_last_whitespace);
202}
203
204/// Binary extracted from [std](https://doc.rust-lang.org/src/core/str/validations.rs.html#36).
205#[inline]
206const fn get_char_metadata(first_byte: u8) -> Character {
207    match first_byte {
208        0b0000_0000..=0b0111_1111 => Character::SingleByte { data: first_byte },
209        0b1000_0000..=0b1101_1111 => Character::MultiByte { len: 2 },
210        0b1110_0000..=0b1110_1111 => Character::MultiByte { len: 3 },
211        0b1111_0000..=0b1111_1111 => Character::MultiByte { len: 4 },
212    }
213}
214
// Byte literals give the ASCII values directly, so no truncating `as` casts
// (and no lint suppressions) are needed.

/// ASCII space (`0x20`).
const SPACE: u8 = b' ';
/// ASCII horizontal tab (`0x09`).
const HORIZONTAL_TAB: u8 = b'\t';
/// ASCII line feed (`0x0A`).
const LINE_FEED: u8 = b'\n';
/// ASCII form feed (`0x0C`).
const FORM_FEED: u8 = b'\x0C';
/// ASCII carriage return (`0x0D`).
const CARRIAGE_RETURN: u8 = b'\r';
225
226/// Values extracted from [std](https://doc.rust-lang.org/src/core/char/methods.rs.html#1680).
227#[inline]
228const fn is_ascii_whitespace(codepoint: u8) -> bool {
229    matches!(
230        codepoint,
231        SPACE | HORIZONTAL_TAB | LINE_FEED | FORM_FEED | CARRIAGE_RETURN
232    )
233}