1use entities;
4use regex::Regex;
5use std::borrow::Cow;
6use std::collections::HashMap;
7use std::sync::LazyLock;
8
/// Pattern for a markdown backslash escape: a backslash followed by one ASCII
/// punctuation character. Capture group 1 holds the escaped character itself.
const UNESCAPE_MD_RE: &str = r##"\\([!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~])"##;
/// Pattern for an entity-shaped token: `&`, then a letter or `#`, then 1 to 31
/// ASCII alphanumerics, then `;`. The part between `&` and `;` is captured.
const ENTITY_RE: &str = r##"&([A-Za-z#][A-Za-z0-9]{1,31});"##;

/// Anchored, case-insensitive matcher for a whole numeric character reference.
/// Group 1 is either `x` followed by 1 to 8 hex digits, or 1 to 8 decimal digits.
static DIGITAL_ENTITY_TEST_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?i)^&#(x[a-f0-9]{1,8}|[0-9]{1,8});$"#).unwrap());
/// Alternation of `UNESCAPE_MD_RE` and `ENTITY_RE`, in that order. In the combined
/// pattern group 1 is therefore set only when the backslash-escape branch matched.
static UNESCAPE_ALL_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(&format!("{UNESCAPE_MD_RE}|{ENTITY_RE}")).unwrap());
16
/// Return `true` if `code`, taken from a numeric character reference, may be
/// turned into a character.
///
/// This is stricter than `char::from_u32`: besides surrogates and values above
/// U+10FFFF it also rejects noncharacters and most control codes. Tab (0x09),
/// line feed (0x0A), form feed (0x0C) and carriage return (0x0D) are accepted,
/// as is 0x20 and everything else not listed below.
pub fn is_valid_entity_code(code: u32) -> bool {
    let rejected = match code {
        // surrogate halves, never valid on their own
        0xD800..=0xDFFF => true,
        // noncharacter block
        0xFDD0..=0xFDEF => true,
        // C0 controls (except tab, LF, FF, CR), DEL and C1 controls
        0x00..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F => true,
        // beyond the last Unicode code point
        0x11_0000..=u32::MAX => true,
        // the last two code points of every plane (xxFFFE / xxFFFF)
        _ => matches!(code & 0xFFFF, 0xFFFE | 0xFFFF),
    };

    !rejected
}
59
60pub fn get_entity_from_str(str: &str) -> Option<&'static str> {
67 pub static ENTITIES_HASH: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
68 let mut mapping = HashMap::new();
69 for e in &entities::ENTITIES {
70 if e.entity.ends_with(';') {
71 mapping.insert(e.entity, e.characters);
72 }
73 }
74 mapping
75 });
76
77 ENTITIES_HASH.get(str).copied()
78}
79
80fn replace_entity_pattern(str: &str) -> Option<String> {
81 if let Some(entity) = get_entity_from_str(str) {
82 Some((*entity).to_owned())
83 } else if let Some(captures) = DIGITAL_ENTITY_TEST_RE.captures(str) {
84 let str = captures.get(1).unwrap().as_str();
85 #[allow(clippy::from_str_radix_10)]
86 let code = if str.starts_with('x') || str.starts_with('X') {
87 u32::from_str_radix(&str[1..], 16).unwrap()
88 } else {
89 u32::from_str_radix(str, 10).unwrap()
90 };
91
92 if is_valid_entity_code(code) {
93 Some(char::from_u32(code).unwrap().into())
94 } else {
95 None
96 }
97 } else {
98 None
99 }
100}
101
102pub fn unescape_all(str: &str) -> Cow<str> {
109 if !str.contains('\\') && !str.contains('&') {
110 return Cow::Borrowed(str);
111 }
112
113 UNESCAPE_ALL_RE.replace_all(str, |captures: ®ex::Captures| {
114 let s = captures.get(0).unwrap().as_str();
115 if let Some(m) = captures.get(1) {
116 m.as_str().to_owned()
118 } else if let Some(replacement) = replace_entity_pattern(s) {
119 replacement
121 } else {
122 s.to_owned()
123 }
124 })
125}
126
/// Escape `str` for safe inclusion in HTML output.
///
/// Thin wrapper that delegates to `html_escape::encode_double_quoted_attribute`;
/// the exact set of characters replaced is whatever that function defines.
/// NOTE(review): presumably the input is returned borrowed when nothing needs
/// escaping. Confirm against the `html_escape` documentation.
pub fn escape_html(str: &str) -> Cow<str> {
    html_escape::encode_double_quoted_attribute(str)
}
135
/// Normalize a reference label so that labels differing only in letter case or
/// in the amount of whitespace compare equal.
///
/// Leading and trailing whitespace is removed, every internal run of whitespace
/// becomes a single space, and the result is case-folded.
pub fn normalize_reference(str: &str) -> String {
    // `split_whitespace` drops leading/trailing whitespace and treats each run
    // of Unicode whitespace as one separator, which is the same as trimming and
    // then replacing `\s+` with a single space, without needing a regex.
    let collapsed = str.split_whitespace().collect::<Vec<_>>().join(" ");

    // Lowercasing before uppercasing also unifies characters whose case
    // mappings are not symmetric: U+1E9E (capital sharp s) lowercases to
    // U+00DF, which uppercases to "SS", the same result as for plain "ss".
    collapsed.to_lowercase().to_uppercase()
}
186
/// Count the characters that follow the last occurrence of `char` in `source`.
///
/// If `char` does not occur at all, the total number of characters is
/// returned. The count is in `char`s, not bytes.
pub fn rfind_and_count(source: &str, char: char) -> usize {
    source.chars().rev().take_while(|&c| c != char).count()
}
207
208pub fn find_indent_of(line: &str, mut pos: usize) -> (usize, usize) {
217 let mut chars = line[pos..].chars();
218 let mut indent = 0;
219
220 loop {
221 match chars.next() {
222 Some('\t') => {
223 let bs_count = rfind_and_count(&line[..pos], '\t');
224 indent += 4 - bs_count % 4;
225 pos += 1;
226 }
227 Some(' ') => {
228 indent += 1;
229 pos += 1;
230 }
231 _ => return (indent, pos),
232 }
233 }
234}
235
236pub fn cut_right_whitespace_with_tabstops(source: &str, indent: i32) -> Cow<str> {
250 let (num_spaces, start) = calc_right_whitespace_with_tabstops(source, indent);
251
252 if num_spaces > 0 {
253 let mut result = " ".repeat(num_spaces);
254 result += &source[start..];
255 Cow::Owned(result)
256 } else {
257 Cow::Borrowed(&source[start..])
258 }
259}
260
261pub fn calc_right_whitespace_with_tabstops(source: &str, mut indent: i32) -> (usize, usize) {
272 let mut start = source.len();
273 let mut chars = source.char_indices().rev();
274
275 while indent > 0 {
276 match chars.next() {
277 Some((pos, '\t')) => {
278 let indent_from_start = rfind_and_count(&source[..pos], '\t');
281 let tab_width = 4 - indent_from_start as i32 % 4;
282
283 if indent < tab_width {
284 return (indent as usize, start);
285 }
286
287 indent -= tab_width;
288 start = pos;
289 }
290 Some((pos, _)) => {
291 indent -= 1;
292 start = pos;
293 }
294 None => {
295 start = 0;
296 break;
297 }
298 }
299 }
300
301 (0, start)
302}
303
304pub fn is_punct_char(ch: char) -> bool {
311 use unicode_general_category::GeneralCategory::*;
312 use unicode_general_category::get_general_category;
313
314 match get_general_category(ch) {
315 ConnectorPunctuation | DashPunctuation | OpenPunctuation | ClosePunctuation
317 | InitialPunctuation | FinalPunctuation | OtherPunctuation => true,
318 _ => false,
319 }
320}
321
// Unit tests for the helpers in this file.
//
// Many literals below are significant down to the exact number of spaces, and
// the entity tests must contain the *encoded* form (`&amp;`, `&#x3F;`, ...) in
// the input. Keep both intact when editing or reformatting.
#[cfg(test)]
mod tests {
    use super::cut_right_whitespace_with_tabstops as cut_ws;
    use super::find_indent_of;
    use super::replace_entity_pattern;
    use super::rfind_and_count;
    use super::unescape_all;

    #[test]
    fn rfind_and_count_test() {
        assert_eq!(rfind_and_count("", 'b'), 0);
        assert_eq!(rfind_and_count("abcde", 'e'), 0);
        assert_eq!(rfind_and_count("abcde", 'b'), 3);
        assert_eq!(rfind_and_count("abcde", 'z'), 5);
        // counted in characters, not bytes
        assert_eq!(rfind_and_count("abcεπ", 'b'), 3);
    }

    #[test]
    fn find_indent_of_simple_test() {
        assert_eq!(find_indent_of("a", 0), (0, 0));
        assert_eq!(find_indent_of(" a", 0), (1, 1));
        assert_eq!(find_indent_of("   a", 0), (3, 3));
        assert_eq!(find_indent_of("    ", 0), (4, 4));
        assert_eq!(find_indent_of("\ta", 0), (4, 1));
        assert_eq!(find_indent_of(" \ta", 0), (4, 2));
        assert_eq!(find_indent_of("  \ta", 0), (4, 3));
        assert_eq!(find_indent_of("   \ta", 0), (4, 4));
        assert_eq!(find_indent_of("    \ta", 0), (8, 5));
    }

    #[test]
    fn find_indent_of_with_offset() {
        assert_eq!(find_indent_of("   a", 2), (1, 3));
        assert_eq!(find_indent_of("    a", 2), (2, 4));
        assert_eq!(find_indent_of("  \ta", 2), (2, 3));
        assert_eq!(find_indent_of("   \ta", 2), (2, 4));
        assert_eq!(find_indent_of("    \ta", 2), (6, 5));
        assert_eq!(find_indent_of("     \ta", 2), (6, 6));
    }

    #[test]
    fn find_indent_of_tabs_test() {
        assert_eq!(find_indent_of("  \t \ta", 1), (7, 5));
        assert_eq!(find_indent_of("  \t \ta", 2), (6, 5));
        assert_eq!(find_indent_of("  \t \ta", 3), (4, 5));
        assert_eq!(find_indent_of("  \t \ta", 4), (3, 5));
    }

    #[test]
    fn cut_ws_simple() {
        assert_eq!(cut_ws("abc", -1), "");
        assert_eq!(cut_ws("abc", 0), "");
        assert_eq!(cut_ws("abc", 1), "c");
        assert_eq!(cut_ws("abc", 2), "bc");
        assert_eq!(cut_ws("abc", 3), "abc");
        assert_eq!(cut_ws("abc", 4), "abc");
    }

    #[test]
    fn cut_ws_unicode() {
        assert_eq!(cut_ws("αβγδ", 1), "δ");
        assert_eq!(cut_ws("αβγδ ", 3), "γδ ");
    }

    #[test]
    fn cut_ws_expands_partial_tabs() {
        assert_eq!(cut_ws("\t", 1), " ");
        assert_eq!(cut_ws("\t", 2), "  ");
        assert_eq!(cut_ws("\t", 3), "   ");
        assert_eq!(cut_ws("\t\t\t", 5), " \t");
        assert_eq!(cut_ws("\t\t\t", 7), "   \t");
    }

    #[test]
    fn cut_ws_retains_full_tabs() {
        assert_eq!(cut_ws("\t\t\t", 4), "\t");
        assert_eq!(cut_ws("\t\t\t", 8), "\t\t");
    }

    #[test]
    fn cut_ws_proper_tabstops() {
        assert_eq!(cut_ws("a\t", 1), " ");
        assert_eq!(cut_ws("a\t", 2), "  ");
        assert_eq!(cut_ws("a\t", 3), "\t");
        assert_eq!(cut_ws("ab\t", 3), "b\t");
        assert_eq!(cut_ws("abc\t", 3), "bc\t");
    }

    #[test]
    fn cut_ws_proper_tabstops_nested() {
        assert_eq!(cut_ws("a\tb\t", 2), "  ");
        assert_eq!(cut_ws("a\tb\t", 3), "\t");
        assert_eq!(cut_ws("a\tb\t", 4), "b\t");
        assert_eq!(cut_ws("a\tb\t", 5), " b\t");
        assert_eq!(cut_ws("a\tb\t", 6), "  b\t");
        assert_eq!(cut_ws("a\tb\t", 7), "\tb\t");
        assert_eq!(cut_ws("a\tb\t", 8), "a\tb\t");
    }

    #[test]
    fn cut_ws_different_tabstops_nested() {
        assert_eq!(cut_ws("abc\tde\tf\tg", 3), "  g");
        assert_eq!(cut_ws("abc\tde\tf\tg", 4), "\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 5), "f\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 6), " f\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 7), "\tf\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 9), "de\tf\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 10), "\tde\tf\tg");
    }

    #[test]
    fn test_replace_entity_pattern() {
        // named entities
        assert_eq!(replace_entity_pattern("&amp;"), Some("&".into()));
        assert_eq!(replace_entity_pattern("&euro;"), Some("€".into()));
        assert_eq!(replace_entity_pattern("&mdash;"), Some("—".into()));
        // decimal and hexadecimal character references
        assert_eq!(replace_entity_pattern("&#8212;"), Some("—".into()));
        assert_eq!(replace_entity_pattern("&#x20;"), Some(" ".into()));
        assert_eq!(replace_entity_pattern("&#x3F;"), Some("?".into()));
        // unknown name, hex digit without the `x` marker, non-hex digits
        assert_eq!(replace_entity_pattern("&ffff;"), None);
        assert_eq!(replace_entity_pattern("&#3F;"), None);
        assert_eq!(replace_entity_pattern("&#xGG;"), None);
    }

    #[test]
    fn test_unescape_all_simple() {
        assert_eq!(unescape_all("&amp;"), "&");
        assert_eq!(unescape_all("\\&"), "&");
    }

    #[test]
    fn test_unescape_all_xss() {
        assert_eq!(
            unescape_all(r#"javascript&#x3A;alert(1)"#),
            r#"javascript:alert(1)"#
        );

        assert_eq!(
            unescape_all(r#"&#74;avascript:alert(1)"#),
            r#"Javascript:alert(1)"#
        );

        // Decoding happens exactly once: the `&` produced from `&amp;` must not
        // combine with the following text into a second entity.
        assert_eq!(
            unescape_all(r#"&amp;#74;avascript:alert(1)"#),
            r#"&#74;avascript:alert(1)"#
        );

        // A backslash-escaped `&` is a literal ampersand, not the start of an entity.
        assert_eq!(
            unescape_all(r#"\&#74;avascript:alert(1)"#),
            r#"&#74;avascript:alert(1)"#
        );

        assert_eq!(
            unescape_all(
                r#"&quot;&gt;&lt;script&gt;alert(&quot;xss&quot;)&lt;/script&gt;"#
            ),
            r#""><script>alert("xss")</script>"#
        );
    }
}