1use entities;
4use once_cell::sync::Lazy;
5use regex::Regex;
6use std::borrow::Cow;
7use std::collections::HashMap;
8
9const UNESCAPE_MD_RE : &str = r##"\\([!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~])"##;
10const ENTITY_RE : &str = r##"&([A-Za-z#][A-Za-z0-9]{1,31});"##;
11
12static DIGITAL_ENTITY_TEST_RE : Lazy<Regex> = Lazy::new(||
13 Regex::new(r#"(?i)^&#(x[a-f0-9]{1,8}|[0-9]{1,8});$"#).unwrap()
14);
15static UNESCAPE_ALL_RE : Lazy<Regex> = Lazy::new(||
16 Regex::new(&format!("{UNESCAPE_MD_RE}|{ENTITY_RE}")).unwrap()
17);
18
/// Reports whether `code`, as obtained from a numeric character reference
/// such as `&#xHHHH;`, is accepted as a character code.
///
/// The following are rejected:
/// - surrogates, `0xD800..=0xDFFF`;
/// - the range `0xFDD0..=0xFDEF`;
/// - any code whose low 16 bits are `0xFFFE` or `0xFFFF`;
/// - the control ranges `0x00..=0x08`, `0x0B`, `0x0E..=0x1F` and `0x7F..=0x9F`;
/// - everything above `0x10FFFF`.
///
/// Every accepted code is therefore also a valid `char`.
pub fn is_valid_entity_code(code: u32) -> bool {
    // The last two code points of every 64K plane are rejected.
    let low_word = code & 0xFFFF;
    if low_word == 0xFFFE || low_word == 0xFFFF {
        return false;
    }

    !matches!(
        code,
        0x00..=0x08
            | 0x0B
            | 0x0E..=0x1F
            | 0x7F..=0x9F
            | 0xD800..=0xDFFF
            | 0xFDD0..=0xFDEF
            | 0x11_0000..=u32::MAX
    )
}
45
46pub fn get_entity_from_str(str: &str) -> Option<&'static str> {
53 pub static ENTITIES_HASH : Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
54 let mut mapping = HashMap::new();
55 for e in &entities::ENTITIES {
56 if e.entity.ends_with(';') {
57 mapping.insert(e.entity, e.characters);
58 }
59 }
60 mapping
61 });
62
63 ENTITIES_HASH.get(str).copied()
64}
65
66#[allow(clippy::from_str_radix_10)]
67fn replace_entity_pattern(str: &str) -> Option<String> {
68 if let Some(entity) = get_entity_from_str(str) {
69 Some((*entity).to_owned())
70 } else if let Some(captures) = DIGITAL_ENTITY_TEST_RE.captures(str) {
71 let str = captures.get(1).unwrap().as_str();
72 let code = if str.starts_with('x') || str.starts_with('X') {
73 u32::from_str_radix(&str[1..], 16).unwrap()
74 } else {
75 u32::from_str_radix(str, 10).unwrap()
76 };
77
78 if is_valid_entity_code(code) {
79 Some(char::from_u32(code).unwrap().into())
80 } else {
81 None
82 }
83 } else {
84 None
85 }
86}
87
88pub fn unescape_all(str: &str) -> Cow<str> {
95 if !str.contains('\\') && !str.contains('&') { return Cow::Borrowed(str); }
96
97 UNESCAPE_ALL_RE.replace_all(str, |captures: ®ex::Captures| {
98 let s = captures.get(0).unwrap().as_str();
99 if let Some(m) = captures.get(1) {
100 m.as_str().to_owned()
102 } else if let Some(replacement) = replace_entity_pattern(s) {
103 replacement
105 } else {
106 s.to_owned()
107 }
108 })
109}
110
111pub fn escape_html(str: &str) -> Cow<str> {
117 html_escape::encode_double_quoted_attribute(str)
118}
119
120pub fn normalize_reference(str: &str) -> String {
130 static SPACE_RE : Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
131
132 let str = SPACE_RE.replace_all(str.trim(), " ");
135
136 str.to_lowercase().to_uppercase()
169}
170
/// Counts the characters that follow the last occurrence of `char` in `source`.
///
/// When `char` does not occur at all, the total number of characters in
/// `source` is returned. Counting is done in `char`s, not bytes.
pub fn rfind_and_count(source: &str, char: char) -> usize {
    source
        .chars()
        .rev()
        .take_while(|&c| c != char)
        .count()
}
189
190pub fn find_indent_of(line: &str, mut pos: usize) -> (usize, usize) {
199 let mut chars = line[pos..].chars();
200 let mut indent = 0;
201
202 loop {
203 match chars.next() {
204 Some('\t') => {
205 let bs_count = rfind_and_count(&line[..pos], '\t');
206 indent += 4 - bs_count % 4;
207 pos += 1;
208 }
209 Some(' ') => {
210 indent += 1;
211 pos += 1;
212 }
213 _ => return ( indent, pos ),
214 }
215 }
216}
217
218pub fn cut_right_whitespace_with_tabstops(source: &str, indent: i32) -> Cow<str> {
232 let (num_spaces, start) = calc_right_whitespace_with_tabstops(source, indent);
233
234 if num_spaces > 0 {
235 let mut result = " ".repeat(num_spaces);
236 result += &source[start..];
237 Cow::Owned(result)
238 } else {
239 Cow::Borrowed(&source[start..])
240 }
241}
242
243pub fn calc_right_whitespace_with_tabstops(source: &str, mut indent: i32) -> (usize, usize) {
254 let mut start = source.len();
255 let mut chars = source.char_indices().rev();
256
257 while indent > 0 {
258 match chars.next() {
259 Some((pos, '\t')) => {
260 let indent_from_start = rfind_and_count(&source[..pos], '\t');
263 let tab_width = 4 - indent_from_start as i32 % 4;
264
265 if indent < tab_width {
266 return ( indent as usize, start );
267 }
268
269 indent -= tab_width;
270 start = pos;
271 }
272 Some((pos, _)) => {
273 indent -= 1;
274 start = pos;
275 }
276 None => {
277 start = 0;
278 break;
279 }
280 }
281 }
282
283 ( 0, start )
284}
285
286pub fn is_punct_char(ch: char) -> bool {
293 use unicode_general_category::get_general_category;
294 use unicode_general_category::GeneralCategory::*;
295
296 match get_general_category(ch) {
297 ConnectorPunctuation | DashPunctuation | OpenPunctuation | ClosePunctuation |
299 InitialPunctuation | FinalPunctuation | OtherPunctuation => true,
300
301 UppercaseLetter | LowercaseLetter | TitlecaseLetter | ModifierLetter | OtherLetter |
303 NonspacingMark | SpacingMark | EnclosingMark |
305 DecimalNumber | LetterNumber | OtherNumber |
307 MathSymbol | CurrencySymbol | ModifierSymbol | OtherSymbol |
309 SpaceSeparator | LineSeparator | ParagraphSeparator |
311 Control | Format | Surrogate | PrivateUse | Unassigned => false
313 }
314}
315
#[cfg(test)]
mod tests {
    use super::cut_right_whitespace_with_tabstops as cut_ws;
    use super::find_indent_of;
    use super::replace_entity_pattern;
    use super::rfind_and_count;
    use super::unescape_all;

    #[test]
    fn rfind_and_count_test() {
        assert_eq!(rfind_and_count("", 'b'), 0);
        assert_eq!(rfind_and_count("abcde", 'e'), 0);
        assert_eq!(rfind_and_count("abcde", 'b'), 3);
        assert_eq!(rfind_and_count("abcde", 'z'), 5);
        assert_eq!(rfind_and_count("abcεπ", 'b'), 3);
    }

    // The number of spaces in the inputs below is significant.
    #[test]
    fn find_indent_of_simple_test() {
        assert_eq!(find_indent_of("a", 0), (0, 0));
        assert_eq!(find_indent_of(" a", 0), (1, 1));
        assert_eq!(find_indent_of("   a", 0), (3, 3));
        assert_eq!(find_indent_of("    ", 0), (4, 4));
        assert_eq!(find_indent_of("\ta", 0), (4, 1));
        assert_eq!(find_indent_of(" \ta", 0), (4, 2));
        assert_eq!(find_indent_of("  \ta", 0), (4, 3));
        assert_eq!(find_indent_of("   \ta", 0), (4, 4));
        assert_eq!(find_indent_of("    \ta", 0), (8, 5));
    }

    #[test]
    fn find_indent_of_with_offset() {
        assert_eq!(find_indent_of("   a", 2), (1, 3));
        assert_eq!(find_indent_of("    a", 2), (2, 4));
        assert_eq!(find_indent_of("  \ta", 2), (2, 3));
        assert_eq!(find_indent_of("   \ta", 2), (2, 4));
        assert_eq!(find_indent_of("    \ta", 2), (6, 5));
        assert_eq!(find_indent_of("     \ta", 2), (6, 6));
    }

    #[test]
    fn find_indent_of_tabs_test() {
        assert_eq!(find_indent_of("  \t \ta", 1), (7, 5));
        assert_eq!(find_indent_of("  \t \ta", 2), (6, 5));
        assert_eq!(find_indent_of("  \t \ta", 3), (4, 5));
        assert_eq!(find_indent_of("  \t \ta", 4), (3, 5));
    }

    #[test]
    fn cut_ws_simple() {
        assert_eq!(cut_ws("abc", -1), "");
        assert_eq!(cut_ws("abc", 0), "");
        assert_eq!(cut_ws("abc", 1), "c");
        assert_eq!(cut_ws("abc", 2), "bc");
        assert_eq!(cut_ws("abc", 3), "abc");
        assert_eq!(cut_ws("abc", 4), "abc");
    }

    #[test]
    fn cut_ws_unicode() {
        assert_eq!(cut_ws("αβγδ", 1), "δ");
        assert_eq!(cut_ws("αβγδ ", 3), "γδ ");
    }

    // A tab that only partly fits is replaced by the owed number of spaces.
    #[test]
    fn cut_ws_expands_partial_tabs() {
        assert_eq!(cut_ws("\t", 1), " ");
        assert_eq!(cut_ws("\t", 2), "  ");
        assert_eq!(cut_ws("\t", 3), "   ");
        assert_eq!(cut_ws("\t\t\t", 5), " \t");
        assert_eq!(cut_ws("\t\t\t", 7), "   \t");
    }

    #[test]
    fn cut_ws_retains_full_tabs() {
        assert_eq!(cut_ws("\t\t\t", 4), "\t");
        assert_eq!(cut_ws("\t\t\t", 8), "\t\t");
    }

    #[test]
    fn cut_ws_proper_tabstops() {
        assert_eq!(cut_ws("a\t", 1), " ");
        assert_eq!(cut_ws("a\t", 2), "  ");
        assert_eq!(cut_ws("a\t", 3), "\t");
        assert_eq!(cut_ws("ab\t", 3), "b\t");
        assert_eq!(cut_ws("abc\t", 3), "bc\t");
    }

    #[test]
    fn cut_ws_proper_tabstops_nested() {
        assert_eq!(cut_ws("a\tb\t", 2), "  ");
        assert_eq!(cut_ws("a\tb\t", 3), "\t");
        assert_eq!(cut_ws("a\tb\t", 4), "b\t");
        assert_eq!(cut_ws("a\tb\t", 5), " b\t");
        assert_eq!(cut_ws("a\tb\t", 6), "  b\t");
        assert_eq!(cut_ws("a\tb\t", 7), "\tb\t");
        assert_eq!(cut_ws("a\tb\t", 8), "a\tb\t");
    }

    #[test]
    fn cut_ws_different_tabstops_nested() {
        assert_eq!(cut_ws("abc\tde\tf\tg", 3), "  g");
        assert_eq!(cut_ws("abc\tde\tf\tg", 4), "\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 5), "f\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 6), " f\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 7), "\tf\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 9), "de\tf\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 10), "\tde\tf\tg");
    }

    // Inputs are spelled as entities; the expected values are the decoded text.
    #[test]
    fn test_replace_entity_pattern() {
        assert_eq!(replace_entity_pattern("&amp;"), Some("&".into()));
        assert_eq!(replace_entity_pattern("&euro;"), Some("€".into()));
        assert_eq!(replace_entity_pattern("&mdash;"), Some("—".into()));
        assert_eq!(replace_entity_pattern("&#8212;"), Some("—".into()));
        assert_eq!(replace_entity_pattern("&#x20;"), Some(" ".into()));
        assert_eq!(replace_entity_pattern("&#x3F;"), Some("?".into()));
        assert_eq!(replace_entity_pattern("&ffff;"), None);
        assert_eq!(replace_entity_pattern("&#3F;"), None);
        assert_eq!(replace_entity_pattern("&#xGG;"), None);
    }

    #[test]
    fn test_unescape_all_simple() {
        assert_eq!(unescape_all("&amp;"), "&");
        assert_eq!(unescape_all("\\&"), "&");
    }

    #[test]
    fn test_unescape_all_xss() {
        assert_eq!(
            unescape_all(r#"javascript&#x3A;alert(1)"#),
            r#"javascript:alert(1)"#
        );

        assert_eq!(
            unescape_all(r#"&#74;avascript:alert(1)"#),
            r#"Javascript:alert(1)"#
        );

        // Decoding happens once: the result must not be decoded again.
        assert_eq!(
            unescape_all(r#"&amp;#74;avascript:alert(1)"#),
            r#"&#74;avascript:alert(1)"#
        );

        // An escaped `&` does not start an entity.
        assert_eq!(
            unescape_all(r#"\&#74;avascript:alert(1)"#),
            r#"&#74;avascript:alert(1)"#
        );

        assert_eq!(
            unescape_all(
                r#"&#34;&#62;&#60;script&#62;alert&#40;&#34;xss&#34;&#41;&#60;/script&#62;"#
            ),
            r#""><script>alert("xss")</script>"#
        );
    }
}