html2md/extended/
sifter.rs1use auto_encoder::auto_encode_bytes;
2use std::str;
3
/// Classification of the byte under the cursor, as produced by `get_char_metadata`.
///
/// The derives are cheap (the enum only holds a `u8` or a `usize`) and make the value
/// printable and comparable when debugging or testing the sifter.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Character {
    /// A 7-bit ASCII byte; the byte itself is carried along so callers need not re-read it.
    SingleByte { data: u8 },
    /// The first byte of a multi-byte sequence that is `len` bytes long in total.
    MultiByte { len: usize },
}
9
10pub trait WhitespaceSifter: AsRef<str> {
12 #[must_use]
16 fn sift(&self) -> String {
17 let input: &str = self.as_ref();
18 let mut out: String = String::with_capacity(input.len());
19 sift_preallocated(input.as_bytes(), &mut out);
20 out
21 }
22
23 #[must_use]
28 fn sift_preserve_newlines(&self) -> String {
29 let input = self.as_ref();
30 let mut out = String::with_capacity(input.len());
31 let bytes = input.as_bytes();
32 let mut ind: usize = 0;
33
34 while ind < bytes.len() {
35 sift_preallocated_until_newline(bytes, &mut ind, &mut out);
36 }
37
38 if out.ends_with("\r\n") {
39 let _ = out.pop();
40 let _ = out.pop();
41 } else if out.ends_with('\n') {
42 let _ = out.pop();
43 }
44
45 out
46 }
47}
48
49pub trait WhitespaceSifterBytes: AsRef<[u8]> {
51 #[must_use]
54 fn sift_bytes(&self) -> String {
55 let input = self.as_ref();
56 let mut out: String = String::with_capacity(input.len());
57 sift_preallocated(input, &mut out);
58 out
59 }
60
61 #[must_use]
64 fn sift_bytes_preserve_newlines(&self) -> String {
65 let bytes = self.as_ref();
66 let mut out = String::with_capacity(bytes.len());
67 let mut ind: usize = 0;
68
69 while ind < bytes.len() {
70 sift_preallocated_until_newline(bytes, &mut ind, &mut out);
71 }
72
73 if out.ends_with("\r\n") {
74 let _ = out.pop();
75 let _ = out.pop();
76 } else if out.ends_with('\n') {
77 let _ = out.pop();
78 }
79
80 out
81 }
82}
83
84impl<T: AsRef<str>> WhitespaceSifter for T {}
85impl<T: AsRef<[u8]>> WhitespaceSifterBytes for T {}
86
87fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) {
89 while *ind < bytes.len() {
90 match get_char_metadata(bytes[*ind]) {
91 Character::SingleByte { data } => {
92 *ind += 1;
93 if !is_ascii_whitespace(data) {
94 out.push(data as char);
95 break;
96 }
97 }
98 Character::MultiByte { len } => {
99 extend_from_bytes_with_len(bytes, ind, out, len);
100 break;
101 }
102 }
103 }
104}
105
/// Removes the last character of `out` when the caller reports that it was whitespace.
///
/// Does nothing when `is_last_whitespace` is `false` or when `out` is already empty.
fn sift_trim_end(out: &mut String, is_last_whitespace: bool) {
    if !is_last_whitespace {
        return;
    }
    let _ = out.pop();
}
112
113fn extend_from_bytes_with_len(bytes: &[u8], ind: &mut usize, out: &mut String, len: usize) {
115 let end = ind.saturating_add(len);
116 if *ind <= end && end <= bytes.len() {
118 let output = auto_encode_bytes(&bytes[*ind..end]);
119 out.push_str(&output);
120 }
121 *ind = end;
122}
123
124#[inline]
125const fn is_newline(codepoint: u8) -> bool {
126 matches!(codepoint, LINE_FEED | CARRIAGE_RETURN)
127}
128
129fn sift_preallocated(bytes: &[u8], out: &mut String) {
131 if !bytes.is_empty() {
132 let mut ind: usize = 0;
133 sift_trim_start(bytes, &mut ind, out);
134 let mut is_last_whitespace: bool = false;
135 let mut is_last_carriage_return: bool = false;
136
137 while ind < bytes.len() {
138 match get_char_metadata(bytes[ind]) {
139 Character::SingleByte { data } => {
140 ind += 1;
141 if is_ascii_whitespace(data) {
142 if data == LINE_FEED && is_last_carriage_return {
143 out.push('\n');
144 is_last_carriage_return = false;
145 continue;
146 }
147 if is_last_whitespace {
148 continue;
149 }
150 is_last_whitespace = true;
151 } else {
152 is_last_whitespace = false;
153 }
154 out.push(data as char);
155 is_last_carriage_return = data == CARRIAGE_RETURN;
156 }
157 Character::MultiByte { len } => {
158 extend_from_bytes_with_len(bytes, &mut ind, out, len);
159 }
160 }
161 is_last_carriage_return = false;
162 }
163 sift_trim_end(out, is_last_whitespace);
164 }
165}
166
167fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut String) {
169 sift_trim_start(bytes, ind, out);
170
171 let mut is_last_whitespace = false;
172 let mut is_last_carriage_return = false;
173
174 while *ind < bytes.len() {
175 match get_char_metadata(bytes[*ind]) {
176 Character::SingleByte { data } => {
177 *ind += 1;
178 if is_ascii_whitespace(data) {
179 if is_newline(data) {
180 if is_last_carriage_return {
181 out.push('\r');
182 }
183 out.push('\n');
184 return;
185 }
186 is_last_carriage_return = data == CARRIAGE_RETURN;
187 if is_last_whitespace {
188 continue;
189 }
190 is_last_whitespace = true;
191 } else {
192 is_last_whitespace = false;
193 }
194 out.push(data as char);
195 }
196 Character::MultiByte { len } => {
197 extend_from_bytes_with_len(bytes, ind, out, len);
198 }
199 }
200 is_last_carriage_return = false;
201 }
202 sift_trim_end(out, is_last_whitespace);
203}
204
205#[inline]
207const fn get_char_metadata(first_byte: u8) -> Character {
208 match first_byte {
209 0b0000_0000..=0b0111_1111 => Character::SingleByte { data: first_byte },
210 0b1000_0000..=0b1101_1111 => Character::MultiByte { len: 2 },
211 0b1110_0000..=0b1110_1111 => Character::MultiByte { len: 3 },
212 0b1111_0000..=0b1111_1111 => Character::MultiByte { len: 4 },
213 }
214}
215
// ASCII whitespace bytes recognised by the sifter. Byte literals already have type `u8`,
// so no narrowing cast (and no lint suppression) is needed.
/// U+0020 SPACE.
const SPACE: u8 = b' ';
/// U+0009 CHARACTER TABULATION.
const HORIZONTAL_TAB: u8 = b'\t';
/// U+000A LINE FEED.
const LINE_FEED: u8 = b'\n';
/// U+000C FORM FEED.
const FORM_FEED: u8 = b'\x0C';
/// U+000D CARRIAGE RETURN.
const CARRIAGE_RETURN: u8 = b'\r';
226
227#[inline]
229const fn is_ascii_whitespace(codepoint: u8) -> bool {
230 matches!(
231 codepoint,
232 SPACE | HORIZONTAL_TAB | LINE_FEED | FORM_FEED | CARRIAGE_RETURN
233 )
234}