markdown_that/common/
utils.rs

1//! Random assortment of functions that are used internally to write plugins.
2
3use entities;
4use regex::Regex;
5use std::borrow::Cow;
6use std::collections::HashMap;
7use std::sync::LazyLock;
8
/// Pattern source for a markdown backslash escape: a backslash followed by one
/// ASCII punctuation character. Capture group 1 is the escaped character.
const UNESCAPE_MD_RE: &str = r##"\\([!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~])"##;
/// Pattern source for an entity-shaped token: `&`, then a letter or `#`, then
/// 1 to 31 ASCII alphanumerics, then `;`. The capture group holds the text
/// between `&` and `;`.
const ENTITY_RE: &str = r##"&([A-Za-z#][A-Za-z0-9]{1,31});"##;

/// Matches a string that is, in its entirety, a numeric character reference:
/// `&#` followed by either `x` plus 1 to 8 hex digits or 1 to 8 decimal digits,
/// then `;` (case-insensitive). Capture group 1 is the part between `&#` and
/// `;`, including the leading `x` for the hex form.
static DIGITAL_ENTITY_TEST_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?i)^&#(x[a-f0-9]{1,8}|[0-9]{1,8});$"#).unwrap());
/// Alternation of `UNESCAPE_MD_RE` and `ENTITY_RE`, in that order: group 1
/// participates for a backslash escape, group 2 for an entity-shaped token.
static UNESCAPE_ALL_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(&format!("{UNESCAPE_MD_RE}|{ENTITY_RE}")).unwrap());
16
/// Decide whether a code point taken from a numeric entity (`&#xHHHH;`) may be emitted.
///
/// Rejects surrogates, noncharacters, most control codes and anything above
/// U+10FFFF, so the accepted set is a subset of what `char::from_u32` accepts.
/// For example 0xFDD0 is a valid `char`, but it is refused here because it is not
/// safe to render on the screen (it may turn you into stone, as per
/// <https://xkcd.com/380/>).
/// ```
/// # use markdown_that::common::utils::is_valid_entity_code;
/// assert_eq!(is_valid_entity_code(1), false);
/// assert_eq!(is_valid_entity_code(32), true);
/// ```
pub fn is_valid_entity_code(code: u32) -> bool {
    match code {
        // surrogate halves: a broken sequence on their own
        0xD800..=0xDFFF => false,
        // reserved block that is never assigned
        0xFDD0..=0xFDEF => false,
        // the last two code points of every 64K plane
        _ if matches!(code & 0xFFFF, 0xFFFE | 0xFFFF) => false,
        // control codes (tab, LF, FF and CR stay allowed)
        0x00..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F => false,
        // everything else is fine as long as it is inside the Unicode range
        _ => code <= 0x10FFFF,
    }
}
59
60/// Check if "&xxxx;" string is a valid HTML entity, return character it represents.
61/// ```
62/// # use markdown_that::common::utils::get_entity_from_str;
63/// assert_eq!(get_entity_from_str("&amp;"), Some("&"));
64/// assert_eq!(get_entity_from_str("&xxx;"), None);
65/// ```
66pub fn get_entity_from_str(str: &str) -> Option<&'static str> {
67    pub static ENTITIES_HASH: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
68        let mut mapping = HashMap::new();
69        for e in &entities::ENTITIES {
70            if e.entity.ends_with(';') {
71                mapping.insert(e.entity, e.characters);
72            }
73        }
74        mapping
75    });
76
77    ENTITIES_HASH.get(str).copied()
78}
79
80fn replace_entity_pattern(str: &str) -> Option<String> {
81    if let Some(entity) = get_entity_from_str(str) {
82        Some((*entity).to_owned())
83    } else if let Some(captures) = DIGITAL_ENTITY_TEST_RE.captures(str) {
84        let str = captures.get(1).unwrap().as_str();
85        #[allow(clippy::from_str_radix_10)]
86        let code = if str.starts_with('x') || str.starts_with('X') {
87            u32::from_str_radix(&str[1..], 16).unwrap()
88        } else {
89            u32::from_str_radix(str, 10).unwrap()
90        };
91
92        if is_valid_entity_code(code) {
93            Some(char::from_u32(code).unwrap().into())
94        } else {
95            None
96        }
97    } else {
98        None
99    }
100}
101
102/// Unescape both entities (`&quot; -> "`) and backslash escapes (`\" -> "`).
103/// ```
104/// # use markdown_that::common::utils::unescape_all;
105/// assert_eq!(unescape_all("&amp;"), "&");
106/// assert_eq!(unescape_all("\\&"), "&");
107/// ```
108pub fn unescape_all(str: &str) -> Cow<str> {
109    if !str.contains('\\') && !str.contains('&') {
110        return Cow::Borrowed(str);
111    }
112
113    UNESCAPE_ALL_RE.replace_all(str, |captures: &regex::Captures| {
114        let s = captures.get(0).unwrap().as_str();
115        if let Some(m) = captures.get(1) {
116            // \" -> "
117            m.as_str().to_owned()
118        } else if let Some(replacement) = replace_entity_pattern(s) {
119            // &quot; -> "
120            replacement
121        } else {
122            s.to_owned()
123        }
124    })
125}
126
127/// Escape `< > " &` with corresponding HTML entities;
128/// ```
129/// # use markdown_that::common::utils::escape_html;
130/// assert_eq!(escape_html("&\""), "&amp;&quot;");
131/// ```
132pub fn escape_html(str: &str) -> Cow<str> {
133    html_escape::encode_double_quoted_attribute(str)
134}
135
/// Unicode case folding and space normalization, used for reference labels.
///
/// Strings that are equal according to the commonmark standard are converted to
/// the same string: leading and trailing whitespace is removed, every inner run
/// of whitespace becomes a single space, and lowercase/uppercase differences
/// go away. The result is upper-cased.
/// ```
/// # use markdown_that::common::utils::normalize_reference;
/// assert_eq!(normalize_reference("hello"), normalize_reference("HELLO"));
/// assert_eq!(normalize_reference("a   b"), normalize_reference("a b"));
/// ```
pub fn normalize_reference(str: &str) -> String {
    // Trim and collapse whitespace.
    //
    // `split_whitespace` skips leading/trailing whitespace and treats each run of
    // Unicode whitespace as one separator, so re-joining the pieces with a single
    // space does both steps at once without needing a regex.
    let mut collapsed = String::with_capacity(str.len());
    for word in str.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }

    // Lower-casing and then upper-casing gets rid of all differences between
    // letter variants.
    //
    // Lower-casing alone does not normalize every code point, and neither does
    // upper-casing alone (for example İ, ϴ, ẞ, Ω, K, Å are already uppercase but
    // have differently-uppercased variants).
    //
    // Here is an example showing how it happens. Take the Greek letter theta:
    // uppercase U+0398 (Θ), U+03F4 (ϴ) and lowercase U+03B8 (θ), U+03D1 (ϑ).
    //
    // Unicode entries:
    // 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8;
    // 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    // 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    // 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8;
    //
    // Case-insensitive comparison should treat all of them as equivalent.
    //
    // But lower-casing does not change ϑ (it is already lowercase),
    // and upper-casing does not change ϴ (it is already uppercase).
    //
    // Applying lower case first and upper case second normalizes all four:
    // "\u{0398}\u{03f4}\u{03b8}\u{03d1}" becomes "\u{0398}\u{0398}\u{0398}\u{0398}".
    //
    // Note: this plays the role of Unicode case folding; Unicode normalization
    // is a different step that is not required here.
    //
    // The upper-cased form is returned (rather than the lower-cased one) to keep
    // the same output as the JS implementation, which stores labels as object
    // keys and upper-cases them to avoid clashing with `Object.prototype`
    // members, most notably `__proto__`.
    //
    // The case conversion runs on the whole collapsed string, not word by word,
    // because lower-casing is context sensitive for some letters.
    collapsed.to_lowercase().to_uppercase()
}
186
/// Count the characters that follow the last occurrence of `char` in `source`.
///
/// The count is in characters, not bytes. When `char` does not occur at all,
/// the total number of characters in `source` is returned.
/// ```
/// # use markdown_that::common::utils::rfind_and_count;
/// assert_eq!(rfind_and_count("abcde", 'e'), 0);
/// assert_eq!(rfind_and_count("abcde", 'b'), 3);
/// assert_eq!(rfind_and_count("abcde", 'z'), 5);
/// ```
pub fn rfind_and_count(source: &str, char: char) -> usize {
    // Walk backwards and stop at the first (i.e. right-most) match.
    source
        .chars()
        .rev()
        .take_while(|&current| current != char)
        .count()
}
207
208/// Calculate the number of spaces from `pos` to the first non-space character or EOL.
209///
210/// Tabs are expanded to a variable number of spaces with tabstop = 4.
211/// Returns relative indent and offset of first non-space character.
212/// ```
213/// # use markdown_that::common::utils::find_indent_of;
214/// assert_eq!(find_indent_of("\tfoo", 0), (4, 1));
215/// ```
216pub fn find_indent_of(line: &str, mut pos: usize) -> (usize, usize) {
217    let mut chars = line[pos..].chars();
218    let mut indent = 0;
219
220    loop {
221        match chars.next() {
222            Some('\t') => {
223                let bs_count = rfind_and_count(&line[..pos], '\t');
224                indent += 4 - bs_count % 4;
225                pos += 1;
226            }
227            Some(' ') => {
228                indent += 1;
229                pos += 1;
230            }
231            _ => return (indent, pos),
232        }
233    }
234}
235
236/// Returns trailing whitespace with total length of `indent`.
237///
238/// Input: a string of characters (presumed whitespaces, can be anything), where each one of
239/// them contributes 1 to indent (except for tabs, whose width may vary with tabstop = 4).
240///
241/// If an indent splits a tab, that tab is replaced with 4 spaces.
242///
243/// Example: cut_right_whitespace_with_tabstops("\t\t", 6) would return "  \t" (two preceding
244/// spaces) because first tab gets expanded to 6 spaces.
245/// ```
246/// # use markdown_that::common::utils::cut_right_whitespace_with_tabstops;
247/// assert_eq!(cut_right_whitespace_with_tabstops("\t\t", 6), "  \t");
248/// ```
249pub fn cut_right_whitespace_with_tabstops(source: &str, indent: i32) -> Cow<str> {
250    let (num_spaces, start) = calc_right_whitespace_with_tabstops(source, indent);
251
252    if num_spaces > 0 {
253        let mut result = " ".repeat(num_spaces);
254        result += &source[start..];
255        Cow::Owned(result)
256    } else {
257        Cow::Borrowed(&source[start..])
258    }
259}
260
261/// Calculate trailing whitespace with total length of `indent`.
262///
263/// See [cut_right_whitespace_with_tabstops](cut_right_whitespace_with_tabstops)
264/// for algorithm and details.
265///
266/// Returns number of spaces and number of bytes to cut from the end.
267/// ```
268/// # use markdown_that::common::utils::calc_right_whitespace_with_tabstops;
269/// assert_eq!(calc_right_whitespace_with_tabstops("\t\t", 6), (2, 1));
270/// ```
271pub fn calc_right_whitespace_with_tabstops(source: &str, mut indent: i32) -> (usize, usize) {
272    let mut start = source.len();
273    let mut chars = source.char_indices().rev();
274
275    while indent > 0 {
276        match chars.next() {
277            Some((pos, '\t')) => {
278                // the previous tab is guaranteed to finish at 0 modulo 4,
279                // so we can finish counting there
280                let indent_from_start = rfind_and_count(&source[..pos], '\t');
281                let tab_width = 4 - indent_from_start as i32 % 4;
282
283                if indent < tab_width {
284                    return (indent as usize, start);
285                }
286
287                indent -= tab_width;
288                start = pos;
289            }
290            Some((pos, _)) => {
291                indent -= 1;
292                start = pos;
293            }
294            None => {
295                start = 0;
296                break;
297            }
298        }
299    }
300
301    (0, start)
302}
303
304/// Checks whether a given character should count as punctuation
305///
306/// Used to determine word boundaries, made to match the implementation of
307/// `isPunctChar` from the JS library.
308/// This is currently implemented as a `match`, but might be simplified as a
309/// regex if benchmarking shows this to be beneficient.
310pub fn is_punct_char(ch: char) -> bool {
311    use unicode_general_category::GeneralCategory::*;
312    use unicode_general_category::get_general_category;
313
314    match get_general_category(ch) {
315        // P
316        ConnectorPunctuation | DashPunctuation | OpenPunctuation | ClosePunctuation
317        | InitialPunctuation | FinalPunctuation | OtherPunctuation => true,
318        _ => false,
319    }
320}
321
// Unit tests for the helpers in this module. `cut_ws` is a local alias for
// `cut_right_whitespace_with_tabstops`.
#[cfg(test)]
mod tests {
    use super::cut_right_whitespace_with_tabstops as cut_ws;
    use super::find_indent_of;
    use super::replace_entity_pattern;
    use super::rfind_and_count;
    use super::unescape_all;

    // Counting is per character (not per byte), including for non-ASCII input.
    #[test]
    fn rfind_and_count_test() {
        assert_eq!(rfind_and_count("", 'b'), 0);
        assert_eq!(rfind_and_count("abcde", 'e'), 0);
        assert_eq!(rfind_and_count("abcde", 'b'), 3);
        assert_eq!(rfind_and_count("abcde", 'z'), 5);
        assert_eq!(rfind_and_count("abcεπ", 'b'), 3);
    }

    // Indentation measured from the start of the line; a tab jumps to the next
    // multiple of 4.
    #[test]
    fn find_indent_of_simple_test() {
        assert_eq!(find_indent_of("a", 0), (0, 0));
        assert_eq!(find_indent_of(" a", 0), (1, 1));
        assert_eq!(find_indent_of("   a", 0), (3, 3));
        assert_eq!(find_indent_of("    ", 0), (4, 4));
        assert_eq!(find_indent_of("\ta", 0), (4, 1));
        assert_eq!(find_indent_of(" \ta", 0), (4, 2));
        assert_eq!(find_indent_of("  \ta", 0), (4, 3));
        assert_eq!(find_indent_of("   \ta", 0), (4, 4));
        assert_eq!(find_indent_of("    \ta", 0), (8, 5));
    }

    // Starting mid-line: the returned indent is relative to `pos`, while tab
    // stops are still aligned to the whole line.
    #[test]
    fn find_indent_of_with_offset() {
        assert_eq!(find_indent_of("   a", 2), (1, 3));
        assert_eq!(find_indent_of("    a", 2), (2, 4));
        assert_eq!(find_indent_of("  \ta", 2), (2, 3));
        assert_eq!(find_indent_of("   \ta", 2), (2, 4));
        assert_eq!(find_indent_of("    \ta", 2), (6, 5));
        assert_eq!(find_indent_of("     \ta", 2), (6, 6));
    }

    // Several tabs on one line, entered at different offsets.
    #[test]
    fn find_indent_of_tabs_test() {
        assert_eq!(find_indent_of("  \t \ta", 1), (7, 5));
        assert_eq!(find_indent_of("  \t \ta", 2), (6, 5));
        assert_eq!(find_indent_of("  \t \ta", 3), (4, 5));
        assert_eq!(find_indent_of("  \t \ta", 4), (3, 5));
    }

    // Without tabs every character is one column wide; non-positive indents give
    // an empty result and oversized indents give the whole input.
    #[test]
    fn cut_ws_simple() {
        assert_eq!(cut_ws("abc", -1), "");
        assert_eq!(cut_ws("abc", 0), "");
        assert_eq!(cut_ws("abc", 1), "c");
        assert_eq!(cut_ws("abc", 2), "bc");
        assert_eq!(cut_ws("abc", 3), "abc");
        assert_eq!(cut_ws("abc", 4), "abc");
    }

    // Multi-byte characters still count as one column each and are never split.
    #[test]
    fn cut_ws_unicode() {
        assert_eq!(cut_ws("αβγδ", 1), "δ");
        assert_eq!(cut_ws("αβγδ ", 3), "γδ ");
    }

    // An indent that ends inside a tab replaces that tab with the spaces needed.
    #[test]
    fn cut_ws_expands_partial_tabs() {
        assert_eq!(cut_ws("\t", 1), " ");
        assert_eq!(cut_ws("\t", 2), "  ");
        assert_eq!(cut_ws("\t", 3), "   ");
        assert_eq!(cut_ws("\t\t\t", 5), " \t");
        assert_eq!(cut_ws("\t\t\t", 7), "   \t");
    }

    // An indent that ends exactly on a tab boundary keeps the tabs untouched.
    #[test]
    fn cut_ws_retains_full_tabs() {
        assert_eq!(cut_ws("\t\t\t", 4), "\t");
        assert_eq!(cut_ws("\t\t\t", 8), "\t\t");
    }

    // A tab preceded by other characters is narrower than 4 columns.
    #[test]
    fn cut_ws_proper_tabstops() {
        assert_eq!(cut_ws("a\t", 1), " ");
        assert_eq!(cut_ws("a\t", 2), "  ");
        assert_eq!(cut_ws("a\t", 3), "\t");
        assert_eq!(cut_ws("ab\t", 3), "b\t");
        assert_eq!(cut_ws("abc\t", 3), "bc\t");
    }

    // Two tabs, each preceded by one character (both tabs are 3 columns wide).
    #[test]
    fn cut_ws_proper_tabstops_nested() {
        assert_eq!(cut_ws("a\tb\t", 2), "  ");
        assert_eq!(cut_ws("a\tb\t", 3), "\t");
        assert_eq!(cut_ws("a\tb\t", 4), "b\t");
        assert_eq!(cut_ws("a\tb\t", 5), " b\t");
        assert_eq!(cut_ws("a\tb\t", 6), "  b\t");
        assert_eq!(cut_ws("a\tb\t", 7), "\tb\t");
        assert_eq!(cut_ws("a\tb\t", 8), "a\tb\t");
    }

    // Tabs of different effective widths within the same string.
    #[test]
    fn cut_ws_different_tabstops_nested() {
        assert_eq!(cut_ws("abc\tde\tf\tg", 3), "  g");
        assert_eq!(cut_ws("abc\tde\tf\tg", 4), "\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 5), "f\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 6), " f\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 7), "\tf\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 9), "de\tf\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 10), "\tde\tf\tg");
    }

    // Named, decimal and hexadecimal entities resolve; unknown names and
    // malformed numeric references do not.
    #[test]
    fn test_replace_entity_pattern() {
        assert_eq!(replace_entity_pattern("&amp;"), Some("&".into()));
        assert_eq!(replace_entity_pattern("&euro;"), Some("€".into()));
        assert_eq!(replace_entity_pattern("&#8212;"), Some("—".into()));
        assert_eq!(replace_entity_pattern("&#x2014;"), Some("—".into()));
        assert_eq!(replace_entity_pattern("&#X20;"), Some(" ".into()));
        assert_eq!(replace_entity_pattern("&#x3F;"), Some("?".into()));
        assert_eq!(replace_entity_pattern("&ffff;"), None);
        assert_eq!(replace_entity_pattern("&#3F;"), None);
        assert_eq!(replace_entity_pattern("&#xGG;"), None);
    }

    // One entity and one backslash escape.
    #[test]
    fn test_unescape_all_simple() {
        assert_eq!(unescape_all("&amp;"), "&");
        assert_eq!(unescape_all("\\&"), "&");
    }

    // Entity-encoded payloads are decoded exactly once: output that itself looks
    // like an entity is not decoded a second time.
    #[test]
    fn test_unescape_all_xss() {
        assert_eq!(
            unescape_all(r#"javascript&#x3A;alert(1)"#),
            r#"javascript:alert(1)"#
        );

        assert_eq!(
            unescape_all(r#"&#74;avascript:alert(1)"#),
            r#"Javascript:alert(1)"#
        );

        assert_eq!(
            unescape_all(r#"&#x26;#74;avascript:alert(1)"#),
            r#"&#74;avascript:alert(1)"#
        );

        assert_eq!(
            unescape_all(r#"\&#74;avascript:alert(1)"#),
            r#"&#74;avascript:alert(1)"#
        );

        assert_eq!(
            unescape_all(
                r#"&#34;&#62;&#60;script&#62;alert&#40;&#34;xss&#34;&#41;&#60;/script&#62;"#
            ),
            r#""><script>alert("xss")</script>"#
        );
    }
}