markdown_it/common/
utils.rs

//! Random assortment of functions that are used internally to write plugins.
2
3use entities;
4use once_cell::sync::Lazy;
5use regex::Regex;
6use std::borrow::Cow;
7use std::collections::HashMap;
8
// Pattern source: a backslash followed by one ASCII punctuation character.
// Capture group 1 is the escaped character itself (without the backslash).
const UNESCAPE_MD_RE : &str = r##"\\([!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~])"##;
// Pattern source: anything shaped like an HTML entity, `&...;`.
// Capture group 1 is the text between `&` and `;` (2 to 32 characters).
const ENTITY_RE      : &str = r##"&([A-Za-z#][A-Za-z0-9]{1,31});"##;

// Anchored, case-insensitive test for a complete numeric entity (`&#123;` or `&#x1F;`).
// Capture group 1 holds the digits, including the leading `x` for hexadecimal ones.
static DIGITAL_ENTITY_TEST_RE : Lazy<Regex> = Lazy::new(||
    Regex::new(r#"(?i)^&#(x[a-f0-9]{1,8}|[0-9]{1,8});$"#).unwrap()
);
// Alternation of `UNESCAPE_MD_RE` and `ENTITY_RE`, in that order: group 1 is set when a
// backslash escape matched, group 2 when an entity-like sequence matched.
static UNESCAPE_ALL_RE        : Lazy<Regex> = Lazy::new(||
    Regex::new(&format!("{UNESCAPE_MD_RE}|{ENTITY_RE}")).unwrap()
);
18
/// Tell whether a `code` parsed from a numeric entity such as `&#xHHHH;` may be
/// turned into a character.
///
/// The accepted set is a strict subset of what `char::from_u32` accepts: on top of
/// surrogates and out-of-range values, noncharacters (for example 0xFDD0 or 0xFFFE)
/// and most control codes are rejected as well. Tab, line feed, form feed and
/// carriage return are the only codes below 0x20 that pass.
/// ```
/// # use markdown_it::common::utils::is_valid_entity_code;
/// assert_eq!(is_valid_entity_code(1), false);
/// assert_eq!(is_valid_entity_code(32), true);
/// ```
pub fn is_valid_entity_code(code: u32) -> bool {
    // The low 16 bits being FFFE or FFFF marks a code as never used.
    let low_word = code & 0xFFFF;
    if low_word == 0xFFFE || low_word == 0xFFFF {
        return false;
    }

    !matches!(
        code,
        0xD800..=0xDFFF             // surrogates (broken sequence)
        | 0xFDD0..=0xFDEF           // never used
        | 0x00..=0x08               // control codes
        | 0x0B
        | 0x0E..=0x1F
        | 0x7F..=0x9F
        | 0x11_0000..=u32::MAX      // out of range
    )
}
45
46/// Check if "&xxxx;" string is a valid HTML entity, return character it represents.
47/// ```
48/// # use markdown_it::common::utils::get_entity_from_str;
49/// assert_eq!(get_entity_from_str("&amp;"), Some("&"));
50/// assert_eq!(get_entity_from_str("&xxx;"), None);
51/// ```
52pub fn get_entity_from_str(str: &str) -> Option<&'static str> {
53    pub static ENTITIES_HASH : Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
54        let mut mapping = HashMap::new();
55        for e in &entities::ENTITIES {
56            if e.entity.ends_with(';') {
57                mapping.insert(e.entity, e.characters);
58            }
59        }
60        mapping
61    });
62
63    ENTITIES_HASH.get(str).copied()
64}
65
66#[allow(clippy::from_str_radix_10)]
67fn replace_entity_pattern(str: &str) -> Option<String> {
68    if let Some(entity) = get_entity_from_str(str) {
69        Some((*entity).to_owned())
70    } else if let Some(captures) = DIGITAL_ENTITY_TEST_RE.captures(str) {
71        let str = captures.get(1).unwrap().as_str();
72        let code = if str.starts_with('x') || str.starts_with('X') {
73            u32::from_str_radix(&str[1..], 16).unwrap()
74        } else {
75            u32::from_str_radix(str, 10).unwrap()
76        };
77
78        if is_valid_entity_code(code) {
79            Some(char::from_u32(code).unwrap().into())
80        } else {
81            None
82        }
83    } else {
84        None
85    }
86}
87
88/// Unescape both entities (`&quot; -> "`) and backslash escapes (`\" -> "`).
89/// ```
90/// # use markdown_it::common::utils::unescape_all;
91/// assert_eq!(unescape_all("&amp;"), "&");
92/// assert_eq!(unescape_all("\\&"), "&");
93/// ```
94pub fn unescape_all(str: &str) -> Cow<str> {
95    if !str.contains('\\') && !str.contains('&') { return Cow::Borrowed(str); }
96
97    UNESCAPE_ALL_RE.replace_all(str, |captures: &regex::Captures| {
98        let s = captures.get(0).unwrap().as_str();
99        if let Some(m) = captures.get(1) {
100            // \" -> "
101            m.as_str().to_owned()
102        } else if let Some(replacement) = replace_entity_pattern(s) {
103            // &quot; -> "
104            replacement
105        } else {
106            s.to_owned()
107        }
108    })
109}
110
/// Escape `" < > &` with the corresponding HTML entities.
///
/// This is a thin wrapper that delegates to
/// `html_escape::encode_double_quoted_attribute`.
/// ```
/// # use markdown_it::common::utils::escape_html;
/// assert_eq!(escape_html("&\""), "&amp;&quot;");
/// ```
pub fn escape_html(str: &str) -> Cow<str> {
    html_escape::encode_double_quoted_attribute(str)
}
119
/// Unicode case folding + space normalization, used for reference labels.
///
/// So that strings equal according to commonmark standard are converted to
/// the same string (lowercase/uppercase differences and spacing go away):
/// leading and trailing whitespace is removed, every inner run of whitespace
/// becomes a single space, and the result is lowercased and then uppercased.
/// ```
/// # use markdown_it::common::utils::normalize_reference;
/// assert_eq!(normalize_reference("hello"), normalize_reference("HELLO"));
/// assert_eq!(normalize_reference("a   b"), normalize_reference("a b"));
/// ```
pub fn normalize_reference(str: &str) -> String {
    // Trim and collapse whitespace.
    //
    // `split_whitespace` skips leading/trailing whitespace and treats every run of
    // Unicode whitespace as one separator, so joining the pieces with a single space
    // is the same as `trim()` followed by replacing `\s+` with " ", without needing
    // a compiled regex.
    //
    let collapsed = str.split_whitespace().collect::<Vec<_>>().join(" ");

    // to_lowercase() followed by to_uppercase() should get rid of all differences
    // between letter variants.
    //
    // Simple lowercasing doesn't normalize 125 code points correctly,
    // and uppercasing doesn't normalize 6 of them (list of exceptions:
    // İ, ϴ, ẞ, Ω, K, Å - those are already uppercased, but have differently
    // uppercased versions).
    //
    // Here's an example showing how it happens. Let's take the greek letter theta:
    // uppercase U+0398 (Θ), U+03f4 (ϴ) and lowercase U+03b8 (θ), U+03d1 (ϑ)
    //
    // Unicode entries:
    // 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8;
    // 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    // 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    // 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8;
    //
    // Case-insensitive comparison should treat all of them as equivalent.
    //
    // But lowercasing doesn't change ϑ (it's already lowercase),
    // and uppercasing doesn't change ϴ (already uppercase).
    //
    // Applying first lower then upper case normalizes any character:
    // "\u{0398}\u{03f4}\u{03b8}\u{03d1}" becomes "\u{0398}\u{0398}\u{0398}\u{0398}"
    //
    // Note: this is equivalent to unicode case folding; unicode normalization
    // is a different step that is not required here.
    //
    // The final result is uppercased for parity with the JavaScript implementation,
    // where labels are stored as object keys and uppercasing avoids a conflict with
    // Object.prototype members (most notably, `__proto__`).
    //
    collapsed.to_lowercase().to_uppercase()
}
170
/// Count how many characters follow the last occurrence of `char` in `source`.
///
/// Walks `source` backwards and stops at the first `char` it meets. When `char`
/// does not occur at all, the result is the total number of characters.
/// The count is in characters, not bytes.
/// ```
/// # use markdown_it::common::utils::rfind_and_count;
/// assert_eq!(rfind_and_count("abcde", 'e'), 0);
/// assert_eq!(rfind_and_count("abcde", 'b'), 3);
/// assert_eq!(rfind_and_count("abcde", 'z'), 5);
/// ```
pub fn rfind_and_count(source: &str, char: char) -> usize {
    source
        .chars()
        .rev()
        .take_while(|&c| c != char)
        .count()
}
189
190/// Calculate number of spaces from `pos` to first non-space character or EOL.
191///
192/// Tabs are expanded to variable number of spaces with tabstop = 4.
193/// Returns relative indent and offset of first non-space character.
194/// ```
195/// # use markdown_it::common::utils::find_indent_of;
196/// assert_eq!(find_indent_of("\tfoo", 0), (4, 1));
197/// ```
198pub fn find_indent_of(line: &str, mut pos: usize) -> (usize, usize) {
199    let mut chars = line[pos..].chars();
200    let mut indent = 0;
201
202    loop {
203        match chars.next() {
204            Some('\t') => {
205                let bs_count = rfind_and_count(&line[..pos], '\t');
206                indent += 4 - bs_count % 4;
207                pos += 1;
208            }
209            Some(' ') => {
210                indent += 1;
211                pos += 1;
212            }
213            _ => return ( indent, pos ),
214        }
215    }
216}
217
218/// Returns trailing whitespace with total length of `indent`.
219///
220/// Input: a string of characters (presumed whitespaces, can be anything), where each one of
221/// them contributes 1 to indent (except for tabs, whose width may vary with tabstop = 4).
222///
223/// If an indent would split a tab, that tab is replaced with 4 spaces.
224///
225/// Example: cut_right_whitespace_with_tabstops("\t\t", 6) would return "  \t" (two preceding
226/// spaces) because first tab gets expanded to 6 spaces.
227/// ```
228/// # use markdown_it::common::utils::cut_right_whitespace_with_tabstops;
229/// assert_eq!(cut_right_whitespace_with_tabstops("\t\t", 6), "  \t");
230/// ```
231pub fn cut_right_whitespace_with_tabstops(source: &str, indent: i32) -> Cow<str> {
232    let (num_spaces, start) = calc_right_whitespace_with_tabstops(source, indent);
233
234    if num_spaces > 0 {
235        let mut result = " ".repeat(num_spaces);
236        result += &source[start..];
237        Cow::Owned(result)
238    } else {
239        Cow::Borrowed(&source[start..])
240    }
241}
242
243/// Calculate trailing whitespace with total length of `indent`.
244///
245/// See [cut_right_whitespace_with_tabstops](cut_right_whitespace_with_tabstops)
246/// for algorithm and details.
247///
248/// Returns number of spaces + number of bytes to cut from the end.
249/// ```
250/// # use markdown_it::common::utils::calc_right_whitespace_with_tabstops;
251/// assert_eq!(calc_right_whitespace_with_tabstops("\t\t", 6), (2, 1));
252/// ```
253pub fn calc_right_whitespace_with_tabstops(source: &str, mut indent: i32) -> (usize, usize) {
254    let mut start = source.len();
255    let mut chars = source.char_indices().rev();
256
257    while indent > 0 {
258        match chars.next() {
259            Some((pos, '\t')) => {
260                // previous tab is guaranteed to finish at 0 modulo 4,
261                // so we can finish counting there
262                let indent_from_start = rfind_and_count(&source[..pos], '\t');
263                let tab_width = 4 - indent_from_start as i32 % 4;
264
265                if indent < tab_width {
266                    return ( indent as usize, start );
267                }
268
269                indent -= tab_width;
270                start = pos;
271            }
272            Some((pos, _)) => {
273                indent -= 1;
274                start = pos;
275            }
276            None => {
277                start = 0;
278                break;
279            }
280        }
281    }
282
283    ( 0, start )
284}
285
286/// Checks whether a given character should count as punctuation
287///
288/// used to determine word boundaries, made to match the implementation of
289/// `isPunctChar` from the JS library.
290/// This is currently implemented as a `match`, but might be simplified as a
291/// regex if benchmarking shows this to be beneficient.
292pub fn is_punct_char(ch: char) -> bool {
293    use unicode_general_category::get_general_category;
294    use unicode_general_category::GeneralCategory::*;
295
296    match get_general_category(ch) {
297        // P
298        ConnectorPunctuation | DashPunctuation | OpenPunctuation | ClosePunctuation |
299        InitialPunctuation | FinalPunctuation | OtherPunctuation => true,
300
301        // L
302        UppercaseLetter | LowercaseLetter | TitlecaseLetter | ModifierLetter | OtherLetter |
303        // M
304        NonspacingMark | SpacingMark | EnclosingMark |
305        // N
306        DecimalNumber | LetterNumber | OtherNumber |
307        // S
308        MathSymbol | CurrencySymbol | ModifierSymbol | OtherSymbol |
309        // Z
310        SpaceSeparator | LineSeparator | ParagraphSeparator |
311        // C
312        Control | Format | Surrogate | PrivateUse | Unassigned => false
313    }
314}
315
#[cfg(test)]
mod tests {
    use super::cut_right_whitespace_with_tabstops as cut_ws;
    use super::{find_indent_of, replace_entity_pattern, rfind_and_count, unescape_all};

    /// Checks `find_indent_of` against `(line, pos, expected)` rows.
    fn check_find_indent_of(cases: &[(&str, usize, (usize, usize))]) {
        for &(line, pos, expected) in cases {
            assert_eq!(find_indent_of(line, pos), expected, "find_indent_of({line:?}, {pos})");
        }
    }

    /// Checks `cut_ws` against `(source, indent, expected)` rows.
    fn check_cut_ws(cases: &[(&str, i32, &str)]) {
        for &(source, indent, expected) in cases {
            assert_eq!(cut_ws(source, indent), expected, "cut_ws({source:?}, {indent})");
        }
    }

    /// Checks `unescape_all` against `(input, expected)` rows.
    fn check_unescape_all(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(unescape_all(input), expected, "unescape_all({input:?})");
        }
    }

    #[test]
    fn rfind_and_count_test() {
        let cases = [
            ("", 'b', 0),
            ("abcde", 'e', 0),
            ("abcde", 'b', 3),
            ("abcde", 'z', 5),
            ("abcεπ", 'b', 3),
        ];
        for (source, needle, expected) in cases {
            assert_eq!(
                rfind_and_count(source, needle),
                expected,
                "rfind_and_count({source:?}, {needle:?})"
            );
        }
    }

    #[test]
    fn find_indent_of_simple_test() {
        check_find_indent_of(&[
            ("a", 0, (0, 0)),
            (" a", 0, (1, 1)),
            ("   a", 0, (3, 3)),
            ("    ", 0, (4, 4)),
            ("\ta", 0, (4, 1)),
            (" \ta", 0, (4, 2)),
            ("  \ta", 0, (4, 3)),
            ("   \ta", 0, (4, 4)),
            ("    \ta", 0, (8, 5)),
        ]);
    }

    #[test]
    fn find_indent_of_with_offset() {
        check_find_indent_of(&[
            ("   a", 2, (1, 3)),
            ("    a", 2, (2, 4)),
            ("  \ta", 2, (2, 3)),
            ("   \ta", 2, (2, 4)),
            ("    \ta", 2, (6, 5)),
            ("     \ta", 2, (6, 6)),
        ]);
    }

    #[test]
    fn find_indent_of_tabs_test() {
        check_find_indent_of(&[
            ("  \t \ta", 1, (7, 5)),
            ("  \t \ta", 2, (6, 5)),
            ("  \t \ta", 3, (4, 5)),
            ("  \t \ta", 4, (3, 5)),
        ]);
    }

    #[test]
    fn cut_ws_simple() {
        check_cut_ws(&[
            ("abc", -1, ""),
            ("abc", 0, ""),
            ("abc", 1, "c"),
            ("abc", 2, "bc"),
            ("abc", 3, "abc"),
            ("abc", 4, "abc"),
        ]);
    }

    #[test]
    fn cut_ws_unicode() {
        check_cut_ws(&[
            ("αβγδ", 1, "δ"),
            ("αβγδ ", 3, "γδ "),
        ]);
    }

    #[test]
    fn cut_ws_expands_partial_tabs() {
        check_cut_ws(&[
            ("\t", 1, " "),
            ("\t", 2, "  "),
            ("\t", 3, "   "),
            ("\t\t\t", 5, " \t"),
            ("\t\t\t", 7, "   \t"),
        ]);
    }

    #[test]
    fn cut_ws_retains_full_tabs() {
        check_cut_ws(&[
            ("\t\t\t", 4, "\t"),
            ("\t\t\t", 8, "\t\t"),
        ]);
    }

    #[test]
    fn cut_ws_proper_tabstops() {
        check_cut_ws(&[
            ("a\t", 1, " "),
            ("a\t", 2, "  "),
            ("a\t", 3, "\t"),
            ("ab\t", 3, "b\t"),
            ("abc\t", 3, "bc\t"),
        ]);
    }

    #[test]
    fn cut_ws_proper_tabstops_nested() {
        check_cut_ws(&[
            ("a\tb\t", 2, "  "),
            ("a\tb\t", 3, "\t"),
            ("a\tb\t", 4, "b\t"),
            ("a\tb\t", 5, " b\t"),
            ("a\tb\t", 6, "  b\t"),
            ("a\tb\t", 7, "\tb\t"),
            ("a\tb\t", 8, "a\tb\t"),
        ]);
    }

    #[test]
    fn cut_ws_different_tabstops_nested() {
        check_cut_ws(&[
            ("abc\tde\tf\tg", 3, "  g"),
            ("abc\tde\tf\tg", 4, "\tg"),
            ("abc\tde\tf\tg", 5, "f\tg"),
            ("abc\tde\tf\tg", 6, " f\tg"),
            ("abc\tde\tf\tg", 7, "\tf\tg"),
            ("abc\tde\tf\tg", 9, "de\tf\tg"),
            ("abc\tde\tf\tg", 10, "\tde\tf\tg"),
        ]);
    }

    #[test]
    fn test_replace_entity_pattern() {
        let cases: [(&str, Option<&str>); 9] = [
            ("&amp;", Some("&")),
            ("&euro;", Some("€")),
            ("&#8212;", Some("—")),
            ("&#x2014;", Some("—")),
            ("&#X20;", Some(" ")),
            ("&#x3F;", Some("?")),
            ("&ffff;", None),
            ("&#3F;", None),
            ("&#xGG;", None),
        ];
        for (input, expected) in cases {
            assert_eq!(
                replace_entity_pattern(input).as_deref(),
                expected,
                "replace_entity_pattern({input:?})"
            );
        }
    }

    #[test]
    fn test_unescape_all_simple() {
        check_unescape_all(&[
            ("&amp;", "&"),
            ("\\&", "&"),
        ]);
    }

    #[test]
    fn test_unescape_all_xss() {
        check_unescape_all(&[
            (
                r#"javascript&#x3A;alert(1)"#,
                r#"javascript:alert(1)"#,
            ),
            (
                r#"&#74;avascript:alert(1)"#,
                r#"Javascript:alert(1)"#,
            ),
            (
                r#"&#x26;#74;avascript:alert(1)"#,
                r#"&#74;avascript:alert(1)"#,
            ),
            (
                r#"\&#74;avascript:alert(1)"#,
                r#"&#74;avascript:alert(1)"#,
            ),
            (
                r#"&#34;&#62;&#60;script&#62;alert&#40;&#34;xss&#34;&#41;&#60;/script&#62;"#,
                r#""><script>alert("xss")</script>"#,
            ),
        ]);
    }
}