html2md/extended/
sifter.rs

1use auto_encoder::auto_encode_bytes;
2use std::str;
3
/// Classification of a character by its first byte.
///
/// Produced by `get_char_metadata` and consumed by the sifting loops to decide
/// whether a byte can be handled on its own or starts a longer sequence.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Character {
    /// A one-byte (ASCII range) character; `data` is the byte itself.
    SingleByte { data: u8 },
    /// The start of a multi-byte sequence that is `len` bytes long in total.
    MultiByte { len: usize },
}
9
10/// A trait containing all `string` whitespace-sifting functions.
11pub trait WhitespaceSifter: AsRef<str> {
12    /// This removes duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) from a `string` implementing `AsRef<str>`.
13    /// This follows the [is_ascii_whitespace](https://doc.rust-lang.org/std/primitive.char.html#method.is_ascii_whitespace) implementation.
14    /// This treats carriage-returns as just one `char` in the `string`.
15    #[must_use]
16    fn sift(&self) -> String {
17        let input: &str = self.as_ref();
18        let mut out: String = String::with_capacity(input.len());
19        sift_preallocated(input.as_bytes(), &mut out);
20        out
21    }
22
23    /// This removes duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) from a `string` implementing `AsRef<str>`.
24    /// This follows the [is_ascii_whitespace](https://doc.rust-lang.org/std/primitive.char.html#method.is_ascii_whitespace) implementation.
25    /// This preserves deduplicated newlines.
26    /// This treats carriage-returns as just one `char` in the `string`.
27    #[must_use]
28    fn sift_preserve_newlines(&self) -> String {
29        let input = self.as_ref();
30        let mut out = String::with_capacity(input.len());
31        let bytes = input.as_bytes();
32        let mut ind: usize = 0;
33
34        while ind < bytes.len() {
35            sift_preallocated_until_newline(bytes, &mut ind, &mut out);
36        }
37
38        if out.ends_with("\r\n") {
39            let _ = out.pop();
40            let _ = out.pop();
41        } else if out.ends_with('\n') {
42            let _ = out.pop();
43        }
44
45        out
46    }
47}
48
49/// A trait containing all `Vec<u8>` whitespace-sifting functions.
50pub trait WhitespaceSifterBytes: AsRef<[u8]> {
51    /// This removes duplicate whitespaces from a `Vec<u8>`.
52    /// It supports the same whitespace definition as [char::is_ascii_whitespace].
53    #[must_use]
54    fn sift_bytes(&self) -> String {
55        let input = self.as_ref();
56        let mut out: String = String::with_capacity(input.len());
57        sift_preallocated(input, &mut out);
58        out
59    }
60
61    /// This removes duplicate whitespaces from a `Vec<u8>`.
62    /// It preserves deduplicated newlines.
63    #[must_use]
64    fn sift_bytes_preserve_newlines(&self) -> String {
65        let bytes = self.as_ref();
66        let mut out = String::with_capacity(bytes.len());
67        let mut ind: usize = 0;
68
69        while ind < bytes.len() {
70            sift_preallocated_until_newline(bytes, &mut ind, &mut out);
71        }
72
73        if out.ends_with("\r\n") {
74            let _ = out.pop();
75            let _ = out.pop();
76        } else if out.ends_with('\n') {
77            let _ = out.pop();
78        }
79
80        out
81    }
82}
83
84impl<T: AsRef<str>> WhitespaceSifter for T {}
85impl<T: AsRef<[u8]>> WhitespaceSifterBytes for T {}
86
87/// A custom implementation of `str::trim_start`.
88fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) {
89    while *ind < bytes.len() {
90        match get_char_metadata(bytes[*ind]) {
91            Character::SingleByte { data } => {
92                *ind += 1;
93                if !is_ascii_whitespace(data) {
94                    out.push(data as char);
95                    break;
96                }
97            }
98            Character::MultiByte { len } => {
99                extend_from_bytes_with_len(bytes, ind, out, len);
100                break;
101            }
102        }
103    }
104}
105
/// A custom implementation for `str::trim_end`.
///
/// Removes the last `char` of `out` when `is_last_whitespace` is `true`;
/// otherwise leaves `out` untouched. An empty `out` is left empty.
fn sift_trim_end(out: &mut String, is_last_whitespace: bool) {
    if !is_last_whitespace {
        return;
    }
    let _ = out.pop();
}
112
113/// Extend the bytes from a slice.
114fn extend_from_bytes_with_len(bytes: &[u8], ind: &mut usize, out: &mut String, len: usize) {
115    let end = ind.saturating_add(len);
116    // Check bounds to ensure we don't run into an out-of-bounds error.
117    if *ind <= end && end <= bytes.len() {
118        let output = auto_encode_bytes(&bytes[*ind..end]);
119        out.push_str(&output);
120    }
121    *ind = end;
122}
123
124#[inline]
125const fn is_newline(codepoint: u8) -> bool {
126    matches!(codepoint, LINE_FEED | CARRIAGE_RETURN)
127}
128
129/// Sift preallocate safe strings.
130fn sift_preallocated(bytes: &[u8], out: &mut String) {
131    if !bytes.is_empty() {
132        let mut ind: usize = 0;
133        sift_trim_start(bytes, &mut ind, out);
134        let mut is_last_whitespace: bool = false;
135        let mut is_last_carriage_return: bool = false;
136
137        while ind < bytes.len() {
138            match get_char_metadata(bytes[ind]) {
139                Character::SingleByte { data } => {
140                    ind += 1;
141                    if is_ascii_whitespace(data) {
142                        if data == LINE_FEED && is_last_carriage_return {
143                            out.push('\n');
144                            is_last_carriage_return = false;
145                            continue;
146                        }
147                        if is_last_whitespace {
148                            continue;
149                        }
150                        is_last_whitespace = true;
151                    } else {
152                        is_last_whitespace = false;
153                    }
154                    out.push(data as char);
155                    is_last_carriage_return = data == CARRIAGE_RETURN;
156                }
157                Character::MultiByte { len } => {
158                    extend_from_bytes_with_len(bytes, &mut ind, out, len);
159                }
160            }
161            is_last_carriage_return = false;
162        }
163        sift_trim_end(out, is_last_whitespace);
164    }
165}
166
167/// Sift preallocate until complete.
168fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut String) {
169    sift_trim_start(bytes, ind, out);
170
171    let mut is_last_whitespace = false;
172    let mut is_last_carriage_return = false;
173
174    while *ind < bytes.len() {
175        match get_char_metadata(bytes[*ind]) {
176            Character::SingleByte { data } => {
177                *ind += 1;
178                if is_ascii_whitespace(data) {
179                    if is_newline(data) {
180                        if is_last_carriage_return {
181                            out.push('\r');
182                        }
183                        out.push('\n');
184                        return;
185                    }
186                    is_last_carriage_return = data == CARRIAGE_RETURN;
187                    if is_last_whitespace {
188                        continue;
189                    }
190                    is_last_whitespace = true;
191                } else {
192                    is_last_whitespace = false;
193                }
194                out.push(data as char);
195            }
196            Character::MultiByte { len } => {
197                extend_from_bytes_with_len(bytes, ind, out, len);
198            }
199        }
200        is_last_carriage_return = false;
201    }
202    sift_trim_end(out, is_last_whitespace);
203}
204
205/// Binary extracted from [std](https://doc.rust-lang.org/src/core/str/validations.rs.html#36).
206#[inline]
207const fn get_char_metadata(first_byte: u8) -> Character {
208    match first_byte {
209        0b0000_0000..=0b0111_1111 => Character::SingleByte { data: first_byte },
210        0b1000_0000..=0b1101_1111 => Character::MultiByte { len: 2 },
211        0b1110_0000..=0b1110_1111 => Character::MultiByte { len: 3 },
212        0b1111_0000..=0b1111_1111 => Character::MultiByte { len: 4 },
213    }
214}
215
// Byte values of the five ASCII whitespace characters recognised by the sifter.
/// ASCII space (`0x20`).
const SPACE: u8 = b' ';
/// ASCII horizontal tab (`0x09`).
const HORIZONTAL_TAB: u8 = b'\t';
/// ASCII line feed (`0x0A`).
const LINE_FEED: u8 = b'\n';
/// ASCII form feed (`0x0C`).
const FORM_FEED: u8 = b'\x0C';
/// ASCII carriage return (`0x0D`).
const CARRIAGE_RETURN: u8 = b'\r';
226
227/// Values extracted from [std](https://doc.rust-lang.org/src/core/char/methods.rs.html#1680).
228#[inline]
229const fn is_ascii_whitespace(codepoint: u8) -> bool {
230    matches!(
231        codepoint,
232        SPACE | HORIZONTAL_TAB | LINE_FEED | FORM_FEED | CARRIAGE_RETURN
233    )
234}