Skip to main content

htmd/
text_util.rs

1pub(crate) trait TrimDocumentWhitespace {
2    fn trim_document_whitespace(&self) -> &str;
3
4    fn trim_start_document_whitespace(&self) -> &str;
5
6    fn trim_end_document_whitespace(&self) -> &str;
7}
8
9impl<S> TrimDocumentWhitespace for S
10where
11    S: AsRef<str>,
12{
13    #[inline]
14    fn trim_document_whitespace(&self) -> &str {
15        self.as_ref().trim_matches(is_document_whitespace)
16    }
17
18    #[inline]
19    fn trim_start_document_whitespace(&self) -> &str {
20        self.as_ref().trim_start_matches(is_document_whitespace)
21    }
22
23    #[inline]
24    fn trim_end_document_whitespace(&self) -> &str {
25        self.as_ref().trim_end_matches(is_document_whitespace)
26    }
27}
28
29pub(crate) trait StripWhitespace {
30    /// Strip leading whitespace.
31    ///
32    /// A tuple of (striped_text, Option<leading_whitespace>) will be returned.
33    fn strip_leading_document_whitespace(&self) -> (&str, Option<&str>);
34    fn strip_leading_whitespace(&self) -> (&str, Option<&str>);
35
36    /// Strip trailing whitespace.
37    ///
38    /// A tuple of (striped_text, Option<trailing_whitespace>) will be returned.
39    fn strip_trailing_document_whitespace(&self) -> (&str, Option<&str>);
40    fn strip_trailing_whitespace(&self) -> (&str, Option<&str>);
41}
42
43impl<S> StripWhitespace for S
44where
45    S: AsRef<str>,
46{
47    fn strip_leading_whitespace(&self) -> (&str, Option<&str>) {
48        let text = self.as_ref();
49        let trimmed_text = text.trim_start();
50        let stripped_len = text.len() - trimmed_text.len();
51        if stripped_len == 0 {
52            (text, None)
53        } else {
54            let start_index = stripped_len;
55            (&text[start_index..], Some(&text[..start_index]))
56        }
57    }
58
59    fn strip_leading_document_whitespace(&self) -> (&str, Option<&str>) {
60        let text = self.as_ref();
61        let trimmed_text = text.trim_start_document_whitespace();
62        let stripped_len = text.len() - trimmed_text.len();
63        if stripped_len == 0 {
64            (text, None)
65        } else {
66            let start_index = stripped_len;
67            (&text[start_index..], Some(&text[..start_index]))
68        }
69    }
70
71    fn strip_trailing_whitespace(&self) -> (&str, Option<&str>) {
72        let text = self.as_ref();
73        let trimmed_text = text.trim_end();
74        let stripped_len = text.len() - trimmed_text.len();
75        if stripped_len == 0 {
76            (text, None)
77        } else {
78            let end_index = trimmed_text.len();
79            (&text[..end_index], Some(&text[end_index..]))
80        }
81    }
82
83    fn strip_trailing_document_whitespace(&self) -> (&str, Option<&str>) {
84        let text = self.as_ref();
85        let trimmed_text = text.trim_end_document_whitespace();
86        let stripped_len = text.len() - trimmed_text.len();
87        if stripped_len == 0 {
88            (text, None)
89        } else {
90            let end_index = trimmed_text.len();
91            (&text[..end_index], Some(&text[end_index..]))
92        }
93    }
94}
95
/// Adds a `join` method to any iterator whose items can be viewed as `&str`.
pub(crate) trait JoinOnStringIterator {
    /// Consumes the remaining items and concatenates them, inserting
    /// `separator` between consecutive items. An exhausted iterator yields an
    /// empty string.
    fn join<S: AsRef<str>>(&mut self, separator: S) -> String;
}

impl<T, S> JoinOnStringIterator for T
where
    S: AsRef<str>,
    T: Iterator<Item = S>,
{
    fn join<SE: AsRef<str>>(&mut self, separator: SE) -> String {
        let mut joined = String::new();
        // The first item is written without a preceding separator.
        if let Some(head) = self.next() {
            let glue = separator.as_ref();
            joined.push_str(head.as_ref());
            for item in self {
                joined.push_str(glue);
                joined.push_str(item.as_ref());
            }
        }
        joined
    }
}
118
/// Join text clips, inspired by:
/// https://github.com/mixmark-io/turndown/blob/cc73387fb707e5fb5e1083e94078d08f38f3abc8/src/turndown.js#L221
///
/// Empty clips are skipped. At each junction the newlines trailing the text
/// built so far and the newlines leading the next clip are replaced by a
/// single run whose length is the larger of the two counts, capped at two.
pub(crate) fn join_blocks(contents: &[String]) -> String {
    // The output is never longer than the inputs combined, so one allocation suffices.
    let total_len: usize = contents.iter().map(|clip| clip.len()).sum();
    let mut joined = String::with_capacity(total_len);

    for clip in contents.iter().filter(|clip| !clip.is_empty()) {
        let kept_len = joined.trim_end_matches('\n').len();
        let body = clip.trim_start_matches('\n');

        let trailing_new_lines = joined.len() - kept_len;
        let leading_new_lines = clip.len() - body.len();

        // Drop the trailing newlines, then re-insert the merged separator.
        joined.truncate(kept_len);
        match trailing_new_lines.max(leading_new_lines) {
            0 => {}
            1 => joined.push('\n'),
            _ => joined.push_str("\n\n"),
        }

        joined.push_str(body);
    }
    joined
}
155
/// Collapses every run of ASCII whitespace into a single space.
///
/// The input is returned borrowed when it already has that shape (every run is
/// exactly one `' '`); an owned string is only allocated once the first
/// character that must be dropped or replaced is found.
pub(crate) fn compress_whitespace(input: &str) -> Cow<'_, str> {
    let mut buffer: Option<String> = None;
    let mut prev_was_whitespace = false;

    // Byte offsets are needed to copy the untouched prefix on first change.
    for (offset, ch) in input.char_indices() {
        let is_whitespace = ch.is_ascii_whitespace();
        // What this character contributes to the output:
        //   non-whitespace            -> itself
        //   whitespace starting a run -> a single space
        //   whitespace inside a run   -> nothing
        let replacement = if !is_whitespace {
            Some(ch)
        } else if prev_was_whitespace {
            None
        } else {
            Some(' ')
        };
        prev_was_whitespace = is_whitespace;

        match buffer.as_mut() {
            // Already diverged from the input: keep appending.
            Some(out) => out.extend(replacement),
            // Still identical to the input so far: nothing to record.
            None if replacement == Some(ch) => {}
            // First divergence: allocate lazily and copy the unchanged prefix.
            None => {
                let mut out = String::with_capacity(input.len());
                out.push_str(&input[..offset]);
                out.extend(replacement);
                buffer = Some(out);
            }
        }
    }

    // No buffer means no change was ever needed.
    buffer.map_or(Cow::Borrowed(input), Cow::Owned)
}
210
// Per [MDN](https://developer.mozilla.org/en-US/docs/Web/CSS/Guides/Text/Whitespace),
// document white space characters only include spaces, tabs, line
// feeds, and carriage returns. Other Unicode whitespace (for example a
// non-breaking space) is deliberately not matched.
fn is_document_whitespace(c: char) -> bool {
    matches!(c, '\t' | '\n' | '\r' | ' ')
}
217
218pub(crate) fn indent_text_except_first_line(
219    text: &str,
220    indent: usize,
221    trim_line_end: bool,
222) -> String {
223    if indent == 0 {
224        return text.to_string();
225    }
226    let line_count = text.lines().count();
227    let estimated_capacity = text.len() + (line_count.saturating_sub(1)) * indent;
228    let mut result = String::with_capacity(estimated_capacity);
229    let indent_text = " ".repeat(indent);
230    for (idx, line) in text.lines().enumerate() {
231        let line = if trim_line_end {
232            line.trim_end_matches(is_document_whitespace)
233        } else {
234            line
235        };
236        if idx > 0 {
237            result.push('\n');
238        }
239        if idx == 0 || line.is_empty() {
240            result.push_str(line);
241        } else {
242            result.push_str(&concat_strings!(indent_text, line));
243        }
244    }
245    result
246}
247
/// Returns `true` when `text` begins with one or more `#` characters that are
/// immediately followed by a space.
///
/// No upper bound is placed on the number of `#` characters, and leading
/// spaces before the first `#` make the check fail.
pub(crate) fn is_markdown_atx_heading(text: &str) -> bool {
    let after_hashes = text.trim_start_matches('#');
    let has_hash_prefix = after_hashes.len() < text.len();
    has_hash_prefix && after_hashes.starts_with(' ')
}
261
/// Returns the byte offset of the `.` in a leading ordered-list marker.
///
/// The text must start with one or more numeric characters (as classified by
/// `char::is_numeric`), followed by exactly one `.`, followed by a space.
/// Anything else, including a text that ends before the space, yields `None`.
///
/// The offset is a byte index into `text`, so it is safe to slice with even
/// when multi-byte numeric characters precede the dot.
pub(crate) fn index_of_markdown_ordered_item_dot(text: &str) -> Option<usize> {
    let mut seen_numeric = false;
    let mut dot_byte_offset: Option<usize> = None;
    for (byte_offset, ch) in text.char_indices() {
        match ch {
            // A space ends the marker; it is only valid right after the dot.
            ' ' => return dot_byte_offset,
            // Accept a single dot, and only directly after the number. A
            // second dot (e.g. "1.. ") falls through to the reject arm.
            '.' if seen_numeric && dot_byte_offset.is_none() => {
                dot_byte_offset = Some(byte_offset);
            }
            // Numeric characters are only allowed before the dot.
            c if c.is_numeric() && dot_byte_offset.is_none() => seen_numeric = true,
            _ => return None,
        }
    }
    None
}
290
/// Concatenates any number of string-like expressions into one `String`,
/// allocating the exact capacity up front.
///
/// Each argument is borrowed as `&str` (so `String`, `&str` and other types
/// that deref to `str` are accepted) and is evaluated exactly once. A
/// trailing comma is allowed.
macro_rules! concat_strings {
    ($($x:expr),* $(,)?) => {{
        // Collect the borrowed pieces first so every argument expression is
        // evaluated a single time, even if it has side effects or is costly.
        let parts: &[&str] = &[$(&$x),*];
        let mut result = String::with_capacity(parts.iter().map(|part| part.len()).sum());
        for part in parts {
            result.push_str(part);
        }
        result
    }};
}
304use std::borrow::Cow;
305
306pub(crate) use concat_strings;
307
#[cfg(test)]
mod tests {
    use super::index_of_markdown_ordered_item_dot as dot_index;

    /// Checks the marker detection on plain single-byte inputs.
    #[test]
    fn test_index_of_markdown_ordered_item_dot() {
        let cases: [(&str, Option<usize>); 11] = [
            ("16.1¾ ", None),
            ("1. ", Some(1)),
            ("12. ", Some(2)),
            ("12345. ", Some(5)),
            ("1. \n", Some(1)),
            (". ", None),
            ("abc. ", None),
            ("1 . ", None),
            (" 1. ", None),
            ("1.a ", None),
            ("1.", None),
        ];
        for (input, expected) in cases {
            assert_eq!(expected, dot_index(input), "input: {input:?}");
        }
    }

    /// Checks that the returned index is a byte offset, not a char count.
    #[test]
    fn test_index_of_markdown_ordered_item_dot_multibyte() {
        // U+00BD (½) is 2 bytes in UTF-8: the dot byte offset is 3, not 2
        let with_dot = dot_index("2½. text");
        assert_eq!(Some(3), with_dot);
        // No dot, should return None
        let without_dot = dot_index("2½");
        assert_eq!(None, without_dot);
    }
}